1 package de.uni_hamburg.corpora.validation;
9 import java.io.IOException;
10 import java.net.URISyntaxException;
11 import java.security.NoSuchAlgorithmException;
12 import java.util.ArrayList;
13 import java.util.Collection;
14 import java.util.HashMap;
15 import java.util.logging.Level;
16 import java.util.logging.Logger;
17 import javax.xml.parsers.DocumentBuilder;
18 import javax.xml.parsers.DocumentBuilderFactory;
19 import javax.xml.parsers.ParserConfigurationException;
20 import javax.xml.transform.TransformerException;
21 import javax.xml.xpath.XPathExpressionException;
22 import org.exmaralda.partitureditor.fsm.FSMException;
23 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
24 import org.jdom.JDOMException;
25 import org.w3c.dom.Document;
26 import org.w3c.dom.Element;
27 import org.w3c.dom.NodeList;
28 import org.xml.sax.SAXException;
// NOTE(review): not populated or read in the visible part of this file - purpose unconfirmed.
43 HashMap<String, HashMap<String, String>> annotations;
// Transcript name -> (key "tierID-eventStart-eventEnd" -> event text); filled by addEvents.
44 HashMap<String, HashMap<String, String>> events;
// Same layout as events, holding a further version of a transcript; compareEvents checks it against events.
45 HashMap<String, HashMap<String, String>> eventsTwo;
// Transcript name -> (timeline item id -> time value); stored by addTimelineItems.
46 HashMap<String, HashMap<String, Float>> tlItems;
// Same layout as tlItems, holding a further version of a transcript; compareTimelineItems checks it against tlItems.
47 HashMap<String, HashMap<String, Float>> tlItemsTwo;
// Transcript name -> (speaker id -> (property name -> property value)); filled by addSpeakers.
48 HashMap<String, HashMap<String, HashMap<String, String>>> speakerTables;
// Same layout as speakerTables, holding a further version of a transcript; compareSpeakers checks it against speakerTables.
49 HashMap<String, HashMap<String, HashMap<String, String>>> speakerTablesTwo;
70 stats = exceptionalCheck(cd);
71 }
catch (ParserConfigurationException pce) {
72 stats.
addException(pce, exbLoc +
": Unknown parsing error");
73 }
catch (SAXException saxe) {
74 stats.
addException(saxe, exbLoc +
": Unknown parsing error");
75 }
catch (IOException ioe) {
76 stats.
addException(ioe, exbLoc +
": Unknown file reading error");
77 }
catch (TransformerException ex) {
78 Logger.getLogger(
ExbMerger.class.getName()).log(Level.SEVERE, null, ex);
79 }
catch (XPathExpressionException ex) {
80 Logger.getLogger(
ExbMerger.class.getName()).log(Level.SEVERE, null, ex);
92 throws SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException {
94 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
95 DocumentBuilder db = dbf.newDocumentBuilder();
97 String transcriptName;
98 if (doc.getElementsByTagName(
"transcription-name").getLength() > 0) {
99 transcriptName = doc.getElementsByTagName(
"transcription-name").item(0).getTextContent();
101 transcriptName =
"No Name Transcript";
104 NodeList tiers = doc.getElementsByTagName(
"tier");
105 NodeList items = doc.getElementsByTagName(
"tli");
106 NodeList speakers = doc.getElementsByTagName(
"speaker");
109 if (events == null) {
110 events =
new HashMap<>();
112 if (tlItems == null) {
113 tlItems =
new HashMap<>();
115 if (speakerTables == null) {
116 speakerTables =
new HashMap<>();
118 if (exbStrings == null) {
119 exbStrings =
new HashMap<>();
123 if (!events.containsKey(transcriptName)) {
124 addEvents(tiers, transcriptName,
true, cd);
129 if (eventsTwo == null) {
130 eventsTwo =
new HashMap<>();
132 if (tlItemsTwo == null) {
133 tlItemsTwo =
new HashMap<>();
135 if (speakerTablesTwo == null) {
136 speakerTablesTwo =
new HashMap<>();
138 if (exbStringsTwo == null) {
139 exbStringsTwo =
new HashMap<>();
141 addEvents(tiers, transcriptName,
false, cd);
148 compareTwoExbs(exbStrings.get(transcriptName), exbStringsTwo.get(transcriptName));
154 HashMap<String, String> eventMap =
new HashMap<>();
155 for (
int i = 0; i < tiers.getLength(); i++) {
156 Element tier = (Element) tiers.item(i);
157 String tierID = tier.getAttribute(
"id");
158 NodeList eventTags = tier.getElementsByTagName(
"event");
159 for (
int j = 0; j < eventTags.getLength(); j++) {
160 Element
event = (Element) eventTags.item(j);
161 String eventStart =
event.getAttribute(
"start");
162 String eventEnd =
event.getAttribute(
"end");
163 String key = tierID +
"-" + eventStart +
"-" + eventEnd;
164 eventMap.put(key, event.getTextContent());
167 if (!eventMap.isEmpty()) {
169 events.put(transcriptName, eventMap);
171 eventsTwo.put(transcriptName, eventMap);
177 Collection<String> c =
new ArrayList<>();
178 HashMap<String, Float> h =
new HashMap<>();
179 for (
int i = 0; i < items.getLength(); i++) {
180 Element item = (Element) items.item(i);
181 String itemID = item.getAttribute(
"id");
182 Float time =
new Float(item.getAttribute(
"time"));
183 if (!h.containsKey(itemID)) {
186 stats.
addWarning(
"exb-merger",
"Exb file " + transcriptName +
" is containing the same timeline item with id " + itemID +
" multiple times");
187 System.out.println(
"Exb file " + transcriptName +
" is containing the same timeline item with id " + itemID +
" multiple times");
192 tlItems.put(transcriptName, h);
194 }
else if (!h.isEmpty()) {
195 tlItemsTwo.put(transcriptName, h);
199 public void addSpeakers(NodeList speakers, String transcriptName,
boolean first) {
200 HashMap<String, HashMap<String, String>> speakerMap =
new HashMap<>();
201 for (
int i = 0; i < speakers.getLength(); i++) {
202 HashMap<String, String> properties =
new HashMap<>();
203 Element speaker = (Element) speakers.item(i);
204 String speakerID = speaker.getAttribute(
"id");
205 String abbreviation = speaker.getElementsByTagName(
"abbreviation").item(0).getTextContent();
206 properties.put(
"abbreviation", abbreviation);
207 Element sex = (Element) speaker.getElementsByTagName(
"sex").item(0);
208 String sexValue = sex.getAttribute(
"value");
209 properties.put(
"sex", sexValue);
210 Element languagesUsed = (Element) speaker.getElementsByTagName(
"languages-used").item(0);
211 NodeList languagesUsedList = languagesUsed.getElementsByTagName(
"language");
212 String usedLanguages =
"";
213 for (
int j = 0; j < languagesUsedList.getLength(); j++) {
214 Element usedLanguage = (Element) languagesUsedList.item(j);
216 usedLanguages += usedLanguage.getAttribute(
"lang");
218 usedLanguages += (
", " + usedLanguage.getAttribute(
"lang"));
221 properties.put(
"languages-used", usedLanguages);
222 Element nativeLanguages = (Element) speaker.getElementsByTagName(
"l1").item(0);
223 NodeList nativeLanguagesList = nativeLanguages.getElementsByTagName(
"language");
224 String languagesNative =
"";
225 for (
int j = 0; j < nativeLanguagesList.getLength(); j++) {
226 Element nativeLanguage = (Element) nativeLanguagesList.item(j);
228 languagesNative += nativeLanguage.getAttribute(
"lang");
230 languagesNative += (
", " + nativeLanguage.getAttribute(
"lang"));
233 properties.put(
"native-languages", languagesNative);
234 Element foreignLanguages = (Element) speaker.getElementsByTagName(
"l2").item(0);
235 NodeList foreignLanguagesList = foreignLanguages.getElementsByTagName(
"language");
236 String languagesForeign =
"";
237 for (
int j = 0; j < foreignLanguagesList.getLength(); j++) {
238 Element foreignLanguage = (Element) foreignLanguagesList.item(j);
240 languagesForeign += foreignLanguage.getAttribute(
"lang");
242 languagesForeign +=
", " + foreignLanguage.getAttribute(
"lang");
245 properties.put(
"foreign-languages", languagesForeign);
246 NodeList udSpeakerInfo = speaker.getElementsByTagName(
"ud-information");
247 for (
int j = 0; j < udSpeakerInfo.getLength(); j++) {
248 Element udSpeakerInformation = (Element) udSpeakerInfo.item(j);
249 String attributeName = udSpeakerInformation.getAttribute(
"attribute-name");
250 String attributeValue = udSpeakerInformation.getTextContent();
251 properties.put(attributeName, attributeValue);
253 speakerMap.put(speakerID, properties);
256 if (!speakerMap.isEmpty()) {
257 speakerTables.put(transcriptName, speakerMap);
259 }
else if (!speakerMap.isEmpty()) {
260 speakerTablesTwo.put(transcriptName, speakerMap);
265 HashMap<String, String> exb = events.get(transcriptName);
266 HashMap<String, String> exbTwo = eventsTwo.get(transcriptName);
267 for (String eventKey : exbTwo.keySet()) {
268 String[] keyValues = eventKey.split(
"-");
269 String tierID = keyValues[0];
270 String eventStart = keyValues[1];
271 String eventEnd = keyValues[2];
272 if (exb.containsKey(eventKey)) {
273 if (!exb.get(eventKey).equals(exbTwo.get(eventKey))) {
274 stats.
addWarning(
"exb-merger",
"Exb file " + cd.
getURL().getFile().substring(cd.
getURL().getFile().lastIndexOf(
"/") + 1)
275 +
" is containing a different annotation for the same event (" + eventStart
276 +
") in its tier " + tierID +
" from another version of the same file! This version " 277 +
"has the annotation: " + exbTwo.get(eventKey) +
", while the other version has the annotation: " 278 + exb.get(eventKey));
279 exmaError.addError(
"exb-merger", cd.
getURL().getFile(), tierID, eventStart,
false,
280 "Exb file " + cd.
getURL().getFile().substring(cd.
getURL().getFile().lastIndexOf(
"/") + 1)
281 +
" is containing a different annotation for the same event (" + eventStart
282 +
") in its tier " + tierID +
" from another version of the same file! This version " 283 +
"has the annotation: " + exbTwo.get(eventKey) +
", while the other version has the annotation: " 284 + exb.get(eventKey));
285 System.out.println(
"Exb file " + cd.
getURL().getFile().substring(cd.
getURL().getFile().lastIndexOf(
"/") + 1)
286 +
" is containing a different annotation for the same event (" + eventStart
287 +
") in its tier " + tierID +
" from another version of the same file! This version " 288 +
"has the annotation: " + exbTwo.get(eventKey) +
", while the other version has the annotation: " 289 + exb.get(eventKey));
292 stats.
addWarning(
"exb-merger",
"Exb file " + cd.
getURL().getFile().substring(cd.
getURL().getFile().lastIndexOf(
"/") + 1)
293 +
" contains an event which starts at timeline ID: (" + eventStart
294 +
") and ends at timelineID: (" + eventEnd +
") in its tier " + tierID +
" which the other version(s) of the" 295 +
" same transcription doesn't contain!");
296 exmaError.addError(
"exb-merger", cd.
getURL().getFile(), tierID, eventStart,
false,
297 "Exb file " + cd.
getURL().getFile().substring(cd.
getURL().getFile().lastIndexOf(
"/") + 1)
298 +
" contains an event which starts at timeline ID: (" + eventStart
299 +
") and ends at timelineID: (" + eventEnd +
") in its tier " + tierID +
" which the other version(s) of the" 300 +
" same transcription doesn't contain!");
301 System.out.println(
"Exb file " + cd.
getURL().getFile().substring(cd.
getURL().getFile().lastIndexOf(
"/") + 1)
302 +
" contains an event which starts at timeline ID: (" + eventStart
303 +
") and ends at timelineID: (" + eventEnd +
") in its tier " + tierID +
" which the other version(s) of the" 304 +
" same transcription doesn't contain!");
310 HashMap<String, Float> exb = tlItems.get(transcriptName);
311 HashMap<String, Float> exbTwo = tlItemsTwo.get(transcriptName);
312 for (String
id : exbTwo.keySet()) {
313 if (exb.containsKey(
id)) {
314 if (Math.abs(exbTwo.get(
id) - exb.get(
id)) > 0.05) {
315 float shift = exbTwo.get(
id) - exb.get(
id);
316 stats.
addWarning(
"exb-merger",
"Exb file " + transcriptName +
"'s timeline has changed.");
317 stats.
addWarning(
"exb-merger",
"Exb file " + transcriptName +
"'s timeline item " +
id +
" has been shifted by " + shift +
" seconds.");
318 System.out.println(
"Exb file " + transcriptName +
"'s timeline has changed.");
319 System.out.println(
"Exb file " + transcriptName +
"'s timeline item " +
id +
" has been shifted by " + shift +
" seconds.");
322 stats.
addWarning(
"exb-merger",
"Exb file " + transcriptName +
" is not containing the same timeline item with id " +
id +
" in one of its versions.");
323 System.out.println(
"Exb file " + transcriptName +
" is not containing the same timeline item with id " +
id +
" in one of its versions.");
329 HashMap<String, HashMap<String, String>> exb = speakerTables.get(transcriptName);
330 HashMap<String, HashMap<String, String>> exbTwo = speakerTablesTwo.get(transcriptName);
331 for (String speakerID : exbTwo.keySet()) {
332 if (exb.containsKey(speakerID)) {
333 for (String property : exbTwo.get(speakerID).keySet()) {
334 if (exb.get(speakerID).containsKey(property)) {
335 String propertyValue = exbTwo.get(speakerID).get(property);
336 String propertyValueDiffVers = exb.get(speakerID).get(property);
337 if (!propertyValue.equals(propertyValueDiffVers)) {
338 stats.
addWarning(
"exb-merger",
"Exb file " + transcriptName +
" is not containing the same property value for " + property
339 +
" of the speaker with id " + speakerID +
" in one of its versions. This version has the value " 340 + propertyValue +
" whilst the other one has the value " + propertyValueDiffVers +
".");
341 System.out.println(
"Exb file " + transcriptName +
" is not containing the same property value for " + property
342 +
" of the speaker with id " + speakerID +
" in one of its versions. This version has the value " 343 + propertyValue +
" whilst the other one has the value " + propertyValueDiffVers +
".");
346 stats.
addWarning(
"exb-merger",
"Exb file " + transcriptName +
" is not containing the same property " + property
347 +
" of the speaker with id " + speakerID +
" in one of its versions.");
348 System.out.println(
"Exb file " + transcriptName +
" is not containing the same property " + property
349 +
" of the speaker with id " + speakerID +
" in one of its versions.");
353 stats.
addWarning(
"exb-merger",
"Exb file " + transcriptName +
" is not containing the same speaker with id " + speakerID +
" in one of its versions.");
354 System.out.println(
"Exb file " + transcriptName +
" is not containing the same timeline item with id " + speakerID +
" in one of its versions.");
360 String firstDifference =
new String(
new char[firstExb.length()]).replace(
'\0',
' ');
361 String secondDifference =
new String(
new char[secondExb.length()]).replace(
'\0',
' ');
362 char[] firstChars = firstDifference.toCharArray();
363 char[] secondChars = secondDifference.toCharArray();
364 String[] firstExbLines = firstExb.split(
"\n");
365 String[] secondExbLines = secondExb.split(
"\n");
366 if (firstExb.length() > secondExb.length()) {
369 for (String secondExbLine : secondExbLines) {
370 if (firstExbLines[lineCounter].length() > secondExbLine.length()) {
371 for (
int i = 0; i < secondExbLine.length(); i++) {
372 if (secondExbLine.charAt(i) != firstExbLines[lineCounter].charAt(i)) {
373 firstChars[charCounter] = firstExbLines[lineCounter].charAt(i);
374 secondChars[charCounter] = secondExbLine.charAt(i);
378 secondChars[charCounter++] =
'\n';
379 for (
int j = charCounter; j < firstExbLines[lineCounter].length(); j++) {
380 firstChars[j] = firstExbLines[lineCounter].charAt(j);
382 firstChars[charCounter++] =
'\n';
384 for (
int i = 0; i < firstExbLines[lineCounter].length(); i++) {
385 if (secondExbLine.charAt(i) != firstExbLines[lineCounter].charAt(i)) {
386 firstChars[charCounter] = firstExbLines[lineCounter].charAt(i);
387 secondChars[charCounter] = secondExbLine.charAt(i);
391 firstChars[charCounter++] =
'\n';
392 for (
int j = charCounter; j < secondExbLines[lineCounter].length(); j++) {
393 secondChars[j] = secondExbLines[lineCounter].charAt(j);
395 secondChars[charCounter++] =
'\n';
399 for (
int j = lineCounter; j < firstExbLines.length; j++) {
400 for (
int i = 0; i < firstExbLines[j].length(); i++) {
401 firstChars[charCounter] = firstExbLines[j].charAt(i);
404 firstChars[charCounter++] =
'\n';
406 firstDifference = String.valueOf(firstChars);
407 secondDifference = String.valueOf(secondChars);
411 for (String firstExbLine : firstExbLines) {
412 if (firstExbLine.length() > secondExbLines[lineCounter].length()) {
413 for (
int i = 0; i < secondExbLines[lineCounter].length(); i++) {
414 if (firstExbLine.charAt(i) != secondExbLines[lineCounter].charAt(i)) {
415 firstChars[charCounter] = firstExbLine.charAt(i);
416 secondChars[charCounter] = secondExbLines[lineCounter].charAt(i);
420 secondChars[charCounter++] =
'\n';
421 for (
int j = charCounter; j < firstExbLines[lineCounter].length(); j++) {
422 firstChars[j] = firstExbLines[lineCounter].charAt(j);
424 firstChars[charCounter++] =
'\n';
426 for (
int i = 0; i < firstExbLine.length(); i++) {
427 if (secondExbLines[lineCounter].charAt(i) != firstExbLine.charAt(i)) {
428 firstChars[charCounter] = firstExbLine.charAt(i);
429 secondChars[charCounter] = secondExbLines[lineCounter].charAt(i);
433 firstChars[charCounter++] =
'\n';
434 for (
int j = charCounter; j < secondExbLines[lineCounter].length(); j++) {
435 secondChars[j] = secondExbLines[lineCounter].charAt(j);
437 secondChars[charCounter++] =
'\n';
441 for (
int j = lineCounter; j < secondExbLines.length; j++) {
442 for (
int i = 0; i < secondExbLines[j].length(); i++) {
443 secondChars[charCounter] = secondExbLines[j].charAt(i);
446 secondChars[charCounter++] =
'\n';
448 firstDifference = String.valueOf(firstChars);
449 secondDifference = String.valueOf(secondChars);
451 String[] differences = {firstDifference, secondDifference};
463 Class cl = Class.forName(
"de.uni_hamburg.corpora.BasicTranscriptionData");
465 }
catch (ClassNotFoundException ex) {
466 Logger.getLogger(
ExbMerger.class.getName()).log(Level.SEVERE, null, ex);
/**
 * Not implemented for single corpus data objects: always throws
 * UnsupportedOperationException with the message "Not supported yet.".
 *
 * @param cd  ignored
 * @param fix ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
473 public Report function(
CorpusData cd, Boolean fix)
throws FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
474 throw new UnsupportedOperationException(
"Not supported yet.");
479 throw new UnsupportedOperationException(
"Not supported yet.");
/**
 * Not implemented for whole corpora: always throws
 * UnsupportedOperationException with the message "Not supported yet.".
 *
 * @param c   ignored
 * @param fix ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
483 public Report function(
Corpus c, Boolean fix)
throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
484 throw new UnsupportedOperationException(
"Not supported yet.");
String toSaveableString()
void addTimelineItems(NodeList items, String transcriptName, boolean first, Report stats)
static ExmaErrorList exmaError
void compareSpeakers(String transcriptName, Report stats)
void compareEvents(String transcriptName, Report stats, CorpusData cd)
Collection< Class<?extends CorpusData > > getIsUsableFor()
HashMap< String, String > exbStrings
String[] compareTwoExbs(String firstExb, String secondExb)
Report check(CorpusData cd)
HashMap< String, String > exbStringsTwo
void addWarning(String statId, String description)
void addSpeakers(NodeList speakers, String transcriptName, boolean first)
void compareTimelineItems(String transcriptName, Report stats)
void addEvents(NodeList tiers, String transcriptName, boolean first, CorpusData cd)
static InputStream String2InputStream(String s)
void addException(Throwable e, String description)