corpus-services  1.0
ExbMerger.java
Go to the documentation of this file.
1 package de.uni_hamburg.corpora.validation;
2 
9 import java.io.IOException;
10 import java.net.URISyntaxException;
11 import java.security.NoSuchAlgorithmException;
12 import java.util.ArrayList;
13 import java.util.Collection;
14 import java.util.HashMap;
15 import java.util.logging.Level;
16 import java.util.logging.Logger;
17 import javax.xml.parsers.DocumentBuilder;
18 import javax.xml.parsers.DocumentBuilderFactory;
19 import javax.xml.parsers.ParserConfigurationException;
20 import javax.xml.transform.TransformerException;
21 import javax.xml.xpath.XPathExpressionException;
22 import org.exmaralda.partitureditor.fsm.FSMException;
23 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
24 import org.jdom.JDOMException;
25 import org.w3c.dom.Document;
26 import org.w3c.dom.Element;
27 import org.w3c.dom.NodeList;
28 import org.xml.sax.SAXException;
29 
38 public class ExbMerger extends Checker implements CorpusFunction {
39 
40  String exbLoc = "";
41  public HashMap<String, String> exbStrings;
42  public HashMap<String, String> exbStringsTwo;
43  HashMap<String, HashMap<String, String>> annotations; // hash map for holding annotations of exb files
44  HashMap<String, HashMap<String, String>> events; // hash map for holding events of exb files
45  HashMap<String, HashMap<String, String>> eventsTwo; // hash map for holding events of second exb files
46  HashMap<String, HashMap<String, Float>> tlItems; // hash map for timeline items of the exb files
47  HashMap<String, HashMap<String, Float>> tlItemsTwo; // hash map for timeline items of second exb files
48  HashMap<String, HashMap<String, HashMap<String, String>>> speakerTables; // hash map for speakers of the exb files
49  HashMap<String, HashMap<String, HashMap<String, String>>> speakerTablesTwo; // hash map for speakers of the second exb files
50  //HashMap<String, Collection<String>> distinctAnnotations; // hash map for storing distinct annots for each transcription file
51  //HashMap<String, HashMap<String, String>> annotationsTwo; // hash map for holding annotations of second exb files
52  //HashMap<String, Integer> noOfSubCategories; // hash map for holding number of subcategories for every category
53  //HashMap<String, String> subCategoryToCategory; // hash map for holding parent categories for sub categories
54  //private int noOfAnnotations = 0; // total no of annotations
55  //private int noOfDifferentAnnotations = 0; // total number of different annotations between different two different versions
56 
/**
 * Creates the merger and passes {@code false} to the {@code Checker} constructor.
 * NOTE(review): the remark that used to stand here read "fixing option available";
 * presumably the flag tells Checker whether a fix can be applied - confirm against Checker.
 */
public ExbMerger() {
    super(false);
}
61 
67  public Report check(CorpusData cd) throws JexmaraldaException {
68  Report stats = new Report();
69  try {
70  stats = exceptionalCheck(cd);
71  } catch (ParserConfigurationException pce) {
72  stats.addException(pce, exbLoc + ": Unknown parsing error");
73  } catch (SAXException saxe) {
74  stats.addException(saxe, exbLoc + ": Unknown parsing error");
75  } catch (IOException ioe) {
76  stats.addException(ioe, exbLoc + ": Unknown file reading error");
77  } catch (TransformerException ex) {
78  Logger.getLogger(ExbMerger.class.getName()).log(Level.SEVERE, null, ex);
79  } catch (XPathExpressionException ex) {
80  Logger.getLogger(ExbMerger.class.getName()).log(Level.SEVERE, null, ex);
81  }
82  return stats;
83  }
84 
91  private Report exceptionalCheck(CorpusData cd)
92  throws SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException {
93  Report stats = new Report(); //create a new report
94  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
95  DocumentBuilder db = dbf.newDocumentBuilder();
96  Document doc = db.parse(TypeConverter.String2InputStream(cd.toSaveableString())); // get the file as a document
97  String transcriptName;
98  if (doc.getElementsByTagName("transcription-name").getLength() > 0) { // check if transcript name exists for the exb file
99  transcriptName = doc.getElementsByTagName("transcription-name").item(0).getTextContent(); // get transcript name
100  } else {
101  transcriptName = "No Name Transcript";
102  }
103 
104  NodeList tiers = doc.getElementsByTagName("tier"); // get all tiers of the transcript
105  NodeList items = doc.getElementsByTagName("tli"); // get all timeline items of the transcript
106  NodeList speakers = doc.getElementsByTagName("speaker"); // get all speakers from the speaker table of the transcript
107 
108  //initialise the hash map only the first time when this function is called
109  if (events == null) {
110  events = new HashMap<>();
111  }
112  if (tlItems == null) {
113  tlItems = new HashMap<>();
114  }
115  if (speakerTables == null) {
116  speakerTables = new HashMap<>();
117  }
118  if (exbStrings == null) {
119  exbStrings = new HashMap<>();
120  }
121  //if annotations hash map doesn't contain the transcript's name, it means
122  //that it is the first time a version of this file is encountered.
123  if (!events.containsKey(transcriptName)) {
124  addEvents(tiers, transcriptName, true, cd);
125  addTimelineItems(items, transcriptName, true, stats);
126  addSpeakers(speakers, transcriptName, true);
127  exbStrings.put(transcriptName, cd.toSaveableString());
128  } else { // another version of this transcript has already been encountered
129  if (eventsTwo == null) {
130  eventsTwo = new HashMap<>();
131  }
132  if (tlItemsTwo == null) {
133  tlItemsTwo = new HashMap<>();
134  }
135  if (speakerTablesTwo == null) {
136  speakerTablesTwo = new HashMap<>();
137  }
138  if (exbStringsTwo == null) {
139  exbStringsTwo = new HashMap<>();
140  }
141  addEvents(tiers, transcriptName, false, cd);
142  addTimelineItems(items, transcriptName, false, stats);
143  addSpeakers(speakers, transcriptName, false);
144  exbStringsTwo.put(transcriptName, cd.toSaveableString());
145  compareEvents(transcriptName, stats, cd);
146  compareTimelineItems(transcriptName, stats);
147  compareSpeakers(transcriptName, stats);
148  compareTwoExbs(exbStrings.get(transcriptName), exbStringsTwo.get(transcriptName));
149  }
150  return stats;
151  }
152 
153  public void addEvents(NodeList tiers, String transcriptName, boolean first, CorpusData cd) {
154  HashMap<String, String> eventMap = new HashMap<>();
155  for (int i = 0; i < tiers.getLength(); i++) { // loop for dealing with each tier
156  Element tier = (Element) tiers.item(i);
157  String tierID = tier.getAttribute("id");
158  NodeList eventTags = tier.getElementsByTagName("event");
159  for (int j = 0; j < eventTags.getLength(); j++) { // annotation events
160  Element event = (Element) eventTags.item(j);
161  String eventStart = event.getAttribute("start");
162  String eventEnd = event.getAttribute("end");
163  String key = tierID + "-" + eventStart + "-" + eventEnd;
164  eventMap.put(key, event.getTextContent());
165  }
166  }
167  if (!eventMap.isEmpty()) {
168  if (first) {
169  events.put(transcriptName, eventMap); // finally add the events of the transcript
170  } else {
171  eventsTwo.put(transcriptName, eventMap); // finally add the events of the second transcript
172  }
173  }
174  }
175 
176  public void addTimelineItems(NodeList items, String transcriptName, boolean first, Report stats) {
177  Collection<String> c = new ArrayList<>(); // collection for adding items into hash map
178  HashMap<String, Float> h = new HashMap<>();
179  for (int i = 0; i < items.getLength(); i++) { // loop for dealing with each timeline item
180  Element item = (Element) items.item(i);
181  String itemID = item.getAttribute("id");
182  Float time = new Float(item.getAttribute("time"));
183  if (!h.containsKey(itemID)) {
184  h.put(itemID, time);
185  } else {
186  stats.addWarning("exb-merger", "Exb file " + transcriptName + " is containing the same timeline item with id " + itemID + " multiple times");
187  System.out.println("Exb file " + transcriptName + " is containing the same timeline item with id " + itemID + " multiple times");
188  }
189  }
190  if (first) {
191  if (!h.isEmpty()) {
192  tlItems.put(transcriptName, h); // finally add the timeline items of the transcript
193  }
194  } else if (!h.isEmpty()) {
195  tlItemsTwo.put(transcriptName, h); // finally add the timeline items of the transcript
196  }
197  }
198 
199  public void addSpeakers(NodeList speakers, String transcriptName, boolean first) {
200  HashMap<String, HashMap<String, String>> speakerMap = new HashMap<>();
201  for (int i = 0; i < speakers.getLength(); i++) { // loop for dealing with each speaker
202  HashMap<String, String> properties = new HashMap<>();
203  Element speaker = (Element) speakers.item(i);
204  String speakerID = speaker.getAttribute("id");
205  String abbreviation = speaker.getElementsByTagName("abbreviation").item(0).getTextContent();
206  properties.put("abbreviation", abbreviation);
207  Element sex = (Element) speaker.getElementsByTagName("sex").item(0);
208  String sexValue = sex.getAttribute("value");
209  properties.put("sex", sexValue);
210  Element languagesUsed = (Element) speaker.getElementsByTagName("languages-used").item(0);
211  NodeList languagesUsedList = languagesUsed.getElementsByTagName("language");
212  String usedLanguages = "";
213  for (int j = 0; j < languagesUsedList.getLength(); j++) {
214  Element usedLanguage = (Element) languagesUsedList.item(j);
215  if (j == 0) {
216  usedLanguages += usedLanguage.getAttribute("lang");
217  } else {
218  usedLanguages += (", " + usedLanguage.getAttribute("lang"));
219  }
220  }
221  properties.put("languages-used", usedLanguages);
222  Element nativeLanguages = (Element) speaker.getElementsByTagName("l1").item(0);
223  NodeList nativeLanguagesList = nativeLanguages.getElementsByTagName("language");
224  String languagesNative = "";
225  for (int j = 0; j < nativeLanguagesList.getLength(); j++) {
226  Element nativeLanguage = (Element) nativeLanguagesList.item(j);
227  if (j == 0) {
228  languagesNative += nativeLanguage.getAttribute("lang");
229  } else {
230  languagesNative += (", " + nativeLanguage.getAttribute("lang"));
231  }
232  }
233  properties.put("native-languages", languagesNative);
234  Element foreignLanguages = (Element) speaker.getElementsByTagName("l2").item(0);
235  NodeList foreignLanguagesList = foreignLanguages.getElementsByTagName("language");
236  String languagesForeign = "";
237  for (int j = 0; j < foreignLanguagesList.getLength(); j++) {
238  Element foreignLanguage = (Element) foreignLanguagesList.item(j);
239  if (j == 0) {
240  languagesForeign += foreignLanguage.getAttribute("lang");
241  } else {
242  languagesForeign += ", " + foreignLanguage.getAttribute("lang");
243  }
244  }
245  properties.put("foreign-languages", languagesForeign);
246  NodeList udSpeakerInfo = speaker.getElementsByTagName("ud-information");
247  for (int j = 0; j < udSpeakerInfo.getLength(); j++) {
248  Element udSpeakerInformation = (Element) udSpeakerInfo.item(j);
249  String attributeName = udSpeakerInformation.getAttribute("attribute-name");
250  String attributeValue = udSpeakerInformation.getTextContent();
251  properties.put(attributeName, attributeValue);
252  }
253  speakerMap.put(speakerID, properties);
254  }
255  if (first) {
256  if (!speakerMap.isEmpty()) {
257  speakerTables.put(transcriptName, speakerMap); // finally add the timeline items of the transcript
258  }
259  } else if (!speakerMap.isEmpty()) {
260  speakerTablesTwo.put(transcriptName, speakerMap); // finally add the timeline items of the transcript
261  }
262  }
263 
264  public void compareEvents(String transcriptName, Report stats, CorpusData cd) {
265  HashMap<String, String> exb = events.get(transcriptName);
266  HashMap<String, String> exbTwo = eventsTwo.get(transcriptName);
267  for (String eventKey : exbTwo.keySet()) {
268  String[] keyValues = eventKey.split("-");
269  String tierID = keyValues[0];
270  String eventStart = keyValues[1];
271  String eventEnd = keyValues[2];
272  if (exb.containsKey(eventKey)) {
273  if (!exb.get(eventKey).equals(exbTwo.get(eventKey))) {
274  stats.addWarning("exb-merger", "Exb file " + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1)
275  + " is containing a different annotation for the same event (" + eventStart
276  + ") in its tier " + tierID + " from another version of the same file! This version "
277  + "has the annotation: " + exbTwo.get(eventKey) + ", while the other version has the annotation: "
278  + exb.get(eventKey));
279  exmaError.addError("exb-merger", cd.getURL().getFile(), tierID, eventStart, false,
280  "Exb file " + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1)
281  + " is containing a different annotation for the same event (" + eventStart
282  + ") in its tier " + tierID + " from another version of the same file! This version "
283  + "has the annotation: " + exbTwo.get(eventKey) + ", while the other version has the annotation: "
284  + exb.get(eventKey));
285  System.out.println("Exb file " + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1)
286  + " is containing a different annotation for the same event (" + eventStart
287  + ") in its tier " + tierID + " from another version of the same file! This version "
288  + "has the annotation: " + exbTwo.get(eventKey) + ", while the other version has the annotation: "
289  + exb.get(eventKey));
290  }
291  } else {
292  stats.addWarning("exb-merger", "Exb file " + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1)
293  + " contains an event which starts at timeline ID: (" + eventStart
294  + ") and ends at timelineID: (" + eventEnd + ") in its tier " + tierID + " which the other version(s) of the"
295  + " same transcription doesn't contain!");
296  exmaError.addError("exb-merger", cd.getURL().getFile(), tierID, eventStart, false,
297  "Exb file " + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1)
298  + " contains an event which starts at timeline ID: (" + eventStart
299  + ") and ends at timelineID: (" + eventEnd + ") in its tier " + tierID + " which the other version(s) of the"
300  + " same transcription doesn't contain!");
301  System.out.println("Exb file " + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1)
302  + " contains an event which starts at timeline ID: (" + eventStart
303  + ") and ends at timelineID: (" + eventEnd + ") in its tier " + tierID + " which the other version(s) of the"
304  + " same transcription doesn't contain!");
305  }
306  }
307  }
308 
309  public void compareTimelineItems(String transcriptName, Report stats) {
310  HashMap<String, Float> exb = tlItems.get(transcriptName);
311  HashMap<String, Float> exbTwo = tlItemsTwo.get(transcriptName);
312  for (String id : exbTwo.keySet()) {
313  if (exb.containsKey(id)) {
314  if (Math.abs(exbTwo.get(id) - exb.get(id)) > 0.05) {
315  float shift = exbTwo.get(id) - exb.get(id);
316  stats.addWarning("exb-merger", "Exb file " + transcriptName + "'s timeline has changed.");
317  stats.addWarning("exb-merger", "Exb file " + transcriptName + "'s timeline item " + id + " has been shifted by " + shift + " seconds.");
318  System.out.println("Exb file " + transcriptName + "'s timeline has changed.");
319  System.out.println("Exb file " + transcriptName + "'s timeline item " + id + " has been shifted by " + shift + " seconds.");
320  }
321  } else {
322  stats.addWarning("exb-merger", "Exb file " + transcriptName + " is not containing the same timeline item with id " + id + " in one of its versions.");
323  System.out.println("Exb file " + transcriptName + " is not containing the same timeline item with id " + id + " in one of its versions.");
324  }
325  }
326  }
327 
328  public void compareSpeakers(String transcriptName, Report stats) {
329  HashMap<String, HashMap<String, String>> exb = speakerTables.get(transcriptName);
330  HashMap<String, HashMap<String, String>> exbTwo = speakerTablesTwo.get(transcriptName);
331  for (String speakerID : exbTwo.keySet()) {
332  if (exb.containsKey(speakerID)) {
333  for (String property : exbTwo.get(speakerID).keySet()) {
334  if (exb.get(speakerID).containsKey(property)) {
335  String propertyValue = exbTwo.get(speakerID).get(property);
336  String propertyValueDiffVers = exb.get(speakerID).get(property);
337  if (!propertyValue.equals(propertyValueDiffVers)) {
338  stats.addWarning("exb-merger", "Exb file " + transcriptName + " is not containing the same property value for " + property
339  + " of the speaker with id " + speakerID + " in one of its versions. This version has the value "
340  + propertyValue + " whilst the other one has the value " + propertyValueDiffVers + ".");
341  System.out.println("Exb file " + transcriptName + " is not containing the same property value for " + property
342  + " of the speaker with id " + speakerID + " in one of its versions. This version has the value "
343  + propertyValue + " whilst the other one has the value " + propertyValueDiffVers + ".");
344  }
345  } else {
346  stats.addWarning("exb-merger", "Exb file " + transcriptName + " is not containing the same property " + property
347  + " of the speaker with id " + speakerID + " in one of its versions.");
348  System.out.println("Exb file " + transcriptName + " is not containing the same property " + property
349  + " of the speaker with id " + speakerID + " in one of its versions.");
350  }
351  }
352  } else {
353  stats.addWarning("exb-merger", "Exb file " + transcriptName + " is not containing the same speaker with id " + speakerID + " in one of its versions.");
354  System.out.println("Exb file " + transcriptName + " is not containing the same timeline item with id " + speakerID + " in one of its versions.");
355  }
356  }
357  }
358 
359  public String[] compareTwoExbs(String firstExb, String secondExb) {
360  String firstDifference = new String(new char[firstExb.length()]).replace('\0', ' ');
361  String secondDifference = new String(new char[secondExb.length()]).replace('\0', ' ');
362  char[] firstChars = firstDifference.toCharArray();
363  char[] secondChars = secondDifference.toCharArray();
364  String[] firstExbLines = firstExb.split("\n");
365  String[] secondExbLines = secondExb.split("\n");
366  if (firstExb.length() > secondExb.length()) {
367  int lineCounter = 0;
368  int charCounter = 0;
369  for (String secondExbLine : secondExbLines) {
370  if (firstExbLines[lineCounter].length() > secondExbLine.length()) {
371  for (int i = 0; i < secondExbLine.length(); i++) {
372  if (secondExbLine.charAt(i) != firstExbLines[lineCounter].charAt(i)) {
373  firstChars[charCounter] = firstExbLines[lineCounter].charAt(i);
374  secondChars[charCounter] = secondExbLine.charAt(i);
375  }
376  charCounter++;
377  }
378  secondChars[charCounter++] = '\n';
379  for (int j = charCounter; j < firstExbLines[lineCounter].length(); j++) {
380  firstChars[j] = firstExbLines[lineCounter].charAt(j);
381  }
382  firstChars[charCounter++] = '\n';
383  } else {
384  for (int i = 0; i < firstExbLines[lineCounter].length(); i++) {
385  if (secondExbLine.charAt(i) != firstExbLines[lineCounter].charAt(i)) {
386  firstChars[charCounter] = firstExbLines[lineCounter].charAt(i);
387  secondChars[charCounter] = secondExbLine.charAt(i);
388  }
389  charCounter++;
390  }
391  firstChars[charCounter++] = '\n';
392  for (int j = charCounter; j < secondExbLines[lineCounter].length(); j++) {
393  secondChars[j] = secondExbLines[lineCounter].charAt(j);
394  }
395  secondChars[charCounter++] = '\n';
396  }
397  lineCounter++;
398  }
399  for (int j = lineCounter; j < firstExbLines.length; j++) {
400  for (int i = 0; i < firstExbLines[j].length(); i++) {
401  firstChars[charCounter] = firstExbLines[j].charAt(i);
402  charCounter++;
403  }
404  firstChars[charCounter++] = '\n';
405  }
406  firstDifference = String.valueOf(firstChars);
407  secondDifference = String.valueOf(secondChars);
408  } else {
409  int lineCounter = 0;
410  int charCounter = 0;
411  for (String firstExbLine : firstExbLines) {
412  if (firstExbLine.length() > secondExbLines[lineCounter].length()) {
413  for (int i = 0; i < secondExbLines[lineCounter].length(); i++) {
414  if (firstExbLine.charAt(i) != secondExbLines[lineCounter].charAt(i)) {
415  firstChars[charCounter] = firstExbLine.charAt(i);
416  secondChars[charCounter] = secondExbLines[lineCounter].charAt(i);
417  }
418  charCounter++;
419  }
420  secondChars[charCounter++] = '\n';
421  for (int j = charCounter; j < firstExbLines[lineCounter].length(); j++) {
422  firstChars[j] = firstExbLines[lineCounter].charAt(j);
423  }
424  firstChars[charCounter++] = '\n';
425  } else {
426  for (int i = 0; i < firstExbLine.length(); i++) {
427  if (secondExbLines[lineCounter].charAt(i) != firstExbLine.charAt(i)) {
428  firstChars[charCounter] = firstExbLine.charAt(i);
429  secondChars[charCounter] = secondExbLines[lineCounter].charAt(i);
430  }
431  charCounter++;
432  }
433  firstChars[charCounter++] = '\n';
434  for (int j = charCounter; j < secondExbLines[lineCounter].length(); j++) {
435  secondChars[j] = secondExbLines[lineCounter].charAt(j);
436  }
437  secondChars[charCounter++] = '\n';
438  }
439  lineCounter++;
440  }
441  for (int j = lineCounter; j < secondExbLines.length; j++) {
442  for (int i = 0; i < secondExbLines[j].length(); i++) {
443  secondChars[charCounter] = secondExbLines[j].charAt(i);
444  charCounter++;
445  }
446  secondChars[charCounter++] = '\n';
447  }
448  firstDifference = String.valueOf(firstChars);
449  secondDifference = String.valueOf(secondChars);
450  }
451  String[] differences = {firstDifference, secondDifference};
452  return differences;
453  }
454 
460  @Override
461  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
462  try {
463  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
464  IsUsableFor.add(cl);
465  } catch (ClassNotFoundException ex) {
466  Logger.getLogger(ExbMerger.class.getName()).log(Level.SEVERE, null, ex);
467  }
468  return IsUsableFor;
469  }
470 
471 
472  @Override
473  public Report function(CorpusData cd, Boolean fix) throws FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
474  throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
475  }
476 
477  @Override
478  public String getDescription() {
479  throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
480  }
481 
482  @Override
483  public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
484  throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
485  }
486 }
void addTimelineItems(NodeList items, String transcriptName, boolean first, Report stats)
Definition: ExbMerger.java:176
void compareSpeakers(String transcriptName, Report stats)
Definition: ExbMerger.java:328
void compareEvents(String transcriptName, Report stats, CorpusData cd)
Definition: ExbMerger.java:264
Collection< Class<?extends CorpusData > > getIsUsableFor()
Definition: ExbMerger.java:461
HashMap< String, String > exbStrings
Definition: ExbMerger.java:41
String[] compareTwoExbs(String firstExb, String secondExb)
Definition: ExbMerger.java:359
HashMap< String, String > exbStringsTwo
Definition: ExbMerger.java:42
void addWarning(String statId, String description)
Definition: Report.java:164
void addSpeakers(NodeList speakers, String transcriptName, boolean first)
Definition: ExbMerger.java:199
void compareTimelineItems(String transcriptName, Report stats)
Definition: ExbMerger.java:309
void addEvents(NodeList tiers, String transcriptName, boolean first, CorpusData cd)
Definition: ExbMerger.java:153
static InputStream String2InputStream(String s)
void addException(Throwable e, String description)
Definition: Report.java:287