6 package de.uni_hamburg.corpora.validation;
13 import java.io.IOException;
14 import java.nio.file.Paths;
16 import java.util.HashSet;
18 import java.util.HashMap;
19 import java.util.List;
20 import java.util.Collection;
21 import java.util.logging.Level;
22 import java.util.logging.Logger;
23 import javax.xml.parsers.ParserConfigurationException;
24 import org.apache.commons.lang.StringUtils;
25 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
26 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
27 import org.exmaralda.partitureditor.jexmaralda.BasicBody;
28 import org.exmaralda.partitureditor.jexmaralda.Tier;
29 import org.jdom.Document;
30 import org.jdom.Element;
31 import org.jdom.JDOMException;
32 import org.jdom.xpath.XPath;
33 import org.xml.sax.SAXException;
35 import java.net.URISyntaxException;
36 import java.security.NoSuchAlgorithmException;
37 import javax.xml.transform.TransformerException;
38 import javax.xml.xpath.XPathExpressionException;
39 import org.exmaralda.partitureditor.fsm.FSMException;
48 private Element communication;
49 private Element basTrans;
50 private Element segTrans;
52 private String comafilename;
53 private File comafile;
54 private String comadirname;
55 final String NSLC =
"nslc";
70 }
catch (JexmaraldaException je) {
72 }
catch (JDOMException jdome) {
74 }
catch (SAXException saxe) {
76 }
catch (IOException ioe) {
85 Document nganasanCorpus
86 = org.exmaralda.common.jdomutilities.IOUtilities.readDocumentFromLocalFile(comafilename);
87 XPath xpCommunications = XPath.newInstance(
"//Communication");
88 List allCommunications = xpCommunications.selectNodes(nganasanCorpus);
89 for (Object o : allCommunications) {
90 communication = (Element) o;
92 String communicationName = communication.getAttributeValue(
"Name");
94 XPath xpBasTrans = XPath.newInstance(
"Transcription[Description" 95 +
"/Key[@Name='segmented']/text()='false']");
96 List allBasTrans = xpBasTrans.selectNodes(communication);
97 for (Object oB : allBasTrans) {
98 basTrans = (Element) oB;
99 String relPath = basTrans.getChildText(
"NSLink");
100 String filePath = comadirname + File.separator + relPath;
101 File file =
new File(filePath);
102 if (!file.isFile()) {
104 "Basic transcription file doesn't exist at " 105 +
"NSLink for " + communicationName);
106 }
else if (Paths.get(relPath).isAbsolute()) {
108 "Basic transcription NSLink is absolute for " 109 + communicationName);
110 }
else if (!relPath.endsWith(communicationName +
".exb")) {
112 "Wrong basic transcription NSLink for " 113 + communicationName);
116 "Basic transcription NSLink OK: " 117 + communicationName);
119 String basTransName = basTrans.getChildText(
"Name");
120 if (!basTransName.equals(communicationName)) {
122 "Wrong basic transcription name for " 123 + communicationName, basTransName +
" should be " 124 + communicationName);
127 "Basic transcription name OK for " 128 + communicationName +
": " + basTransName);
130 if (!basTrans.getChildText(
"Filename").equals(
131 communicationName +
".exb")) {
133 "Wrong basic transcripton filename for " 134 + communicationName);
137 "Correct Filename for basic transcription " 138 + communicationName);
141 XPath xpSegTrans = XPath.newInstance(
142 "Transcription[Description" 143 +
"/Key[@Name='segmented']/text()='true']");
144 List allSegTrans = xpSegTrans.selectNodes(communication);
145 for (Object oS : allSegTrans) {
146 segTrans = (Element) oS;
147 String relPath = segTrans.getChildText(
"NSLink");
148 String filePath = comadirname + File.separator + relPath;
149 File file =
new File(filePath);
150 if (!file.isFile()) {
152 "Segmented transcription file doesn't exist at" 153 +
" NSLink for " + communicationName);
154 }
// Inspect the NSLink value held in relPath. The quoted literal "relPath" is
// always a relative path, so with it this branch could never be taken.
else if (Paths.get(relPath).isAbsolute()) {
156 "Segmented transcription NSLink is absolute for " 157 + communicationName);
158 }
else if (!relPath.endsWith(communicationName +
"_s.exs")) {
160 "Wrong segmented transcription NSLink for " 161 + communicationName, relPath +
" should end in " 162 + communicationName +
"_s.exs");
165 "Correct segmented transcription NSLink for " 166 + communicationName);
168 String segTransName = segTrans.getChildText(
"Name");
169 if (!segTransName.equals(communicationName)) {
171 "Wrong segmented transcription name for " 172 + communicationName, segTransName +
" should be " 173 + communicationName);
174 }
else if (!segTrans.getChildText(
"Filename").equals(
175 communicationName +
"_s.exs")) {
177 "Wrong segmented transcription filename for " 179 segTrans.getChildText(
"Filename")
180 +
" should be " + communicationName +
"_s.exs");
183 "Correct Filename for segmented transcription " 184 + communicationName);
187 XPath xpRec = XPath.newInstance(
"Recording/Media");
188 List allRec = xpRec.selectNodes(communication);
189 for (Object oR : allRec) {
190 Element media = (Element) oR;
191 rec = media.getParentElement();
192 String relPath = media.getChildText(
"NSLink");
193 String filePath = comadirname + File.separator + relPath;
194 File file =
new File(filePath);
195 if (!file.isFile()) {
197 "Recording file doesn't exist at NSLink for " 198 + communicationName);
199 }
// Inspect the NSLink value held in relPath. The quoted literal "relPath" is
// always a relative path, so with it this branch could never be taken.
else if (Paths.get(relPath).isAbsolute()) {
201 "Recording NSLink is absolute for " 202 + communicationName);
203 }
else if (!StringUtils.substringBefore(relPath,
".").endsWith(
204 communicationName)) {
206 "Wrong recording NSLink for " 208 StringUtils.substringBefore(relPath,
".")
209 +
" should end with " + communicationName);
212 "Recording NSLink is correct for " 213 + communicationName);
215 String recName = rec.getChildText(
"Name");
216 if (!recName.equals(communicationName)) {
218 "Wrong recording name for " + communicationName,
219 recName +
" should be " + communicationName);
336 SAXException, JDOMException, IOException, JexmaraldaException {
337 Map<String, String> obligatoryTiers =
new HashMap<String, String>();
338 Map<String, String> optionalTiers =
new HashMap<String, String>();
339 obligatoryTiers.put(
"ref",
"Name of the communication");
340 optionalTiers.put(
"st",
"Source texts: normally in Cyrillic " 341 +
"transliteration");
342 obligatoryTiers.put(
"ts",
"Transcription (what is heard)");
343 obligatoryTiers.put(
"tx",
"Tier for interlinearization)");
344 obligatoryTiers.put(
"mb",
"Morpheme break");
345 obligatoryTiers.put(
"mp",
"Morphophonemes, underlying forms");
346 obligatoryTiers.put(
"gr",
"Morphological annotation: Russian gloss of " 348 obligatoryTiers.put(
"ge",
"Morphological annotation: Egnlish gloss of " 350 obligatoryTiers.put(
"mc",
"Part of speech of each morpheme");
351 obligatoryTiers.put(
"ps",
"Part of speech of each word");
352 obligatoryTiers.put(
"SeR",
"Annotation of semantic roles");
353 obligatoryTiers.put(
"SyF",
"Annotation of syntactic function");
354 optionalTiers.put(
"IST",
"Annotation of information status");
355 optionalTiers.put(
"CW",
"Annotation of code switching");
356 obligatoryTiers.put(
"fr",
"Russian free translation");
357 optionalTiers.put(
"fe",
"English free translation");
358 optionalTiers.put(
"fh",
"Hungarian free translation");
359 optionalTiers.put(
"so",
"Source origin");
360 optionalTiers.put(
"fg",
"German free translation");
361 optionalTiers.put(
"nt",
"Notes on the text unit");
362 Map<String, String> tierTypes =
new HashMap<String, String>();
363 tierTypes.put(
"ref",
"d");
364 tierTypes.put(
"st",
"d");
365 tierTypes.put(
"ts",
"d");
366 tierTypes.put(
"tx",
"t");
367 tierTypes.put(
"mb",
"a");
368 tierTypes.put(
"mp",
"a");
369 tierTypes.put(
"gr",
"a");
370 tierTypes.put(
"ge",
"a");
371 tierTypes.put(
"mc",
"a");
372 tierTypes.put(
"ps",
"a");
373 tierTypes.put(
"SeR",
"a");
374 tierTypes.put(
"SyF",
"a");
375 tierTypes.put(
"IST",
"a");
376 tierTypes.put(
"CW",
"a");
377 tierTypes.put(
"fr",
"d");
378 tierTypes.put(
"fe",
"d");
379 tierTypes.put(
"fg",
"d");
380 tierTypes.put(
"nt",
"d");
381 tierTypes.put(
"so",
"d");
382 tierTypes.put(
"fh",
"d");
385 Document nganasanCorpus
386 = org.exmaralda.common.jdomutilities.IOUtilities.readDocumentFromLocalFile(comafilename);
387 XPath xpCommunications = XPath.newInstance(
"//Communication");
388 List allCommunications = xpCommunications.selectNodes(nganasanCorpus);
389 Set<String> skipTiers =
new HashSet<String>();
390 skipTiers.add(
"COLUMN-LABEL");
391 skipTiers.add(
"ROW-LABEL");
392 skipTiers.add(
"SUB-ROW-LABEL");
393 skipTiers.add(
"EMPTY");
394 skipTiers.add(
"EMPTY-EDITOR");
396 for (Object o : allCommunications) {
397 communication = (Element) o;
399 String communicationName = communication.getAttributeValue(
"Name");
401 XPath xpBasTrans = XPath.newInstance(
"Transcription[Description" 402 +
"/Key[@Name='segmented']/text()='false']");
403 List allBasTrans = xpBasTrans.selectNodes(communication);
404 for (Object oB : allBasTrans) {
405 basTrans = (Element) oB;
406 String relPath = basTrans.getChildText(
"NSLink");
407 String filePath = comadirname + File.separator + relPath;
408 File file =
new File(filePath);
409 if (!file.isFile()) {
413 Set<String> obligatoriesSeen =
new HashSet<String>();
414 Set<String> optionalsSeen =
new HashSet<String>();
415 Element desc = basTrans.getChild(
"Description");
416 BasicTranscription bt =
new BasicTranscription(filePath);
417 BasicBody bb = bt.getBody();
418 String[] tierIDs = bb.getAllTierIDs();
419 for (String tierID : tierIDs) {
420 if (skipTiers.contains(tierID)) {
422 "Skipped a tier: " + tierID,
423 "This tier does not need to be included in " 429 tier = bb.getTierWithID(tierID);
430 }
catch (JexmaraldaException je) {
433 exmaError.addError(NSLC, comadirname + relPath, tierID,
"",
false,
"ERROR: tier with ID " + tierID
437 String displayName = tier.getDisplayName();
438 String category = tier.getCategory();
439 String tierType = tier.getType();
440 if (obligatoryTiers.containsKey(category)) {
441 obligatoriesSeen.add(category);
442 }
else if (optionalTiers.containsKey(category)) {
443 optionalsSeen.add(category);
446 "Unrecognised tier name: " 448 exmaError.addError(NSLC, comadirname + relPath, tierID,
"",
false,
"Unrecognised tier name: " 452 if (tierTypes.containsKey(category)) {
453 if (!tierTypes.get(category).equals(tierType)) {
455 "Wrong tier type for: " 456 + tierID,
"Switch to annotation or " 457 +
" description tier");
458 exmaError.addError(NSLC, comadirname + relPath, tierID,
"",
false,
"Wrong tier type for: " 463 "Correct tier type for: " + tierID);
467 "Not known if tier: " 468 + tierID +
" should be annotation or " 470 exmaError.addError(NSLC, comadirname + relPath, tierID,
"",
false,
"Not known if tier: " 471 + tierID +
" should be annotation or " 475 if (!category.equals(tierID)) {
477 "Tier ID should match category, " 478 +
"but " + tierID +
" is not " + category);
479 exmaError.addError(NSLC, comadirname + relPath, tierID,
"",
false,
"Tier ID should match category, " 480 +
"but " + tierID +
" is not " + category);
484 for (Map.Entry<String, String> entry : obligatoryTiers.entrySet()) {
485 boolean found =
false;
486 for (String seen : obligatoriesSeen) {
487 if (entry.getKey().equals(seen)) {
493 "Missing required tier: " 494 + entry.getKey() +
": " + entry.getValue());
495 exmaError.addError(NSLC, comadirname + relPath,
"",
"",
false,
"Missing required tier: " 496 + entry.getKey() +
": " + entry.getValue());
504 public static void main(String[] args) {
510 }
catch (JDOMException ex) {
511 ex.printStackTrace();
512 }
catch (IOException ex) {
513 ex.printStackTrace();
527 }
catch (JexmaraldaException je) {
529 }
catch (JDOMException jdome) {
531 }
catch (SAXException saxe) {
533 }
catch (IOException ioe) {
535 }
catch (ParserConfigurationException ex) {
546 throws SAXException, IOException, ParserConfigurationException, JexmaraldaException, JDOMException {
548 comafilename = cd.
getURL().getFile();
549 comadirname = comafilename.substring(0, comafilename.lastIndexOf(
"/") + 1);
550 Document nganasanCorpus
551 = org.exmaralda.common.jdomutilities.IOUtilities.readDocumentFromLocalFile(comafilename);
552 XPath xpCommunications = XPath.newInstance(
"//Communication");
553 List allCommunications = xpCommunications.selectNodes(nganasanCorpus);
554 for (Object o : allCommunications) {
555 communication = (Element) o;
557 String communicationName = communication.getAttributeValue(
"Name");
559 XPath xpBasTrans = XPath.newInstance(
"Transcription[Description" 560 +
"/Key[@Name='segmented']/text()='false']");
561 List allBasTrans = xpBasTrans.selectNodes(communication);
562 for (Object oB : allBasTrans) {
563 basTrans = (Element) oB;
564 String relPath = basTrans.getChildText(
"NSLink");
565 String filePath = comadirname + File.separator + relPath;
566 File file =
new File(filePath);
567 if (!file.isFile()) {
569 "Basic transcription file doesn't exist at " 570 +
"NSLink for " + communicationName);
571 }
else if (Paths.get(relPath).isAbsolute()) {
573 "Basic transcription NSLink is absolute for " 574 + communicationName);
575 }
else if (!relPath.endsWith(communicationName +
".exb")) {
577 "Wrong basic transcription NSLink for " 578 + communicationName);
581 "Basic transcription NSLink OK: " 582 + communicationName);
584 String basTransName = basTrans.getChildText(
"Name");
585 if (!basTransName.equals(communicationName)) {
587 "Wrong basic transcription name for " 588 + communicationName, basTransName +
" should be " 589 + communicationName);
592 "Basic transcription name OK for " 593 + communicationName +
": " + basTransName);
595 if (!basTrans.getChildText(
"Filename").equals(
596 communicationName +
".exb")) {
598 "Wrong basic transcripton filename for " 599 + communicationName);
602 "Correct Filename for basic transcription " 603 + communicationName);
606 XPath xpSegTrans = XPath.newInstance(
607 "Transcription[Description" 608 +
"/Key[@Name='segmented']/text()='true']");
609 List allSegTrans = xpSegTrans.selectNodes(communication);
610 for (Object oS : allSegTrans) {
611 segTrans = (Element) oS;
612 String relPath = segTrans.getChildText(
"NSLink");
613 String filePath = comadirname + File.separator + relPath;
614 File file =
new File(filePath);
615 if (!file.isFile()) {
617 "Segmented transcription file doesn't exist at" 618 +
" NSLink for " + communicationName);
619 }
// Inspect the NSLink value held in relPath. The quoted literal "relPath" is
// always a relative path, so with it this branch could never be taken.
else if (Paths.get(relPath).isAbsolute()) {
621 "Segmented transcription NSLink is absolute for " 622 + communicationName);
623 }
else if (!relPath.endsWith(communicationName +
"_s.exs")) {
625 "Wrong segmented transcription NSLink for " 626 + communicationName, relPath +
" should end in " 627 + communicationName +
"_s.exs");
630 "Correct segmented transcription NSLink for " 631 + communicationName);
633 String segTransName = segTrans.getChildText(
"Name");
634 if (!segTransName.equals(communicationName)) {
636 "Wrong segmented transcription name for " 637 + communicationName, segTransName +
" should be " 638 + communicationName);
639 }
else if (!segTrans.getChildText(
"Filename").equals(
640 communicationName +
"_s.exs")) {
642 "Wrong segmented transcription filename for " 644 segTrans.getChildText(
"Filename")
645 +
" should be " + communicationName +
"_s.exs");
648 "Correct Filename for segmented transcription " 649 + communicationName);
652 XPath xpRec = XPath.newInstance(
"Recording/Media");
653 List allRec = xpRec.selectNodes(communication);
654 for (Object oR : allRec) {
655 Element media = (Element) oR;
656 rec = media.getParentElement();
657 String relPath = media.getChildText(
"NSLink");
658 String filePath = comadirname + File.separator + relPath;
659 File file =
new File(filePath);
660 if (!file.isFile()) {
662 "Recording file doesn't exist at NSLink for " 663 + communicationName);
664 }
// Inspect the NSLink value held in relPath. The quoted literal "relPath" is
// always a relative path, so with it this branch could never be taken.
else if (Paths.get(relPath).isAbsolute()) {
666 "Recording NSLink is absolute for " 667 + communicationName);
668 }
else if (!StringUtils.substringBefore(relPath,
".").endsWith(
669 communicationName)) {
671 "Wrong recording NSLink for " 673 StringUtils.substringBefore(relPath,
".")
674 +
" should end with " + communicationName);
677 "Recording NSLink is correct for " 678 + communicationName);
680 String recName = rec.getChildText(
"Name");
681 if (!recName.equals(communicationName)) {
683 "Wrong recording name for " + communicationName,
684 recName +
" should be " + communicationName);
699 Class cl = Class.forName(
"de.uni_hamburg.corpora.ComaData");
701 }
catch (ClassNotFoundException ex) {
713 String description =
"This class is the check procedure for the Nganasan" 714 +
" Corpus and checks if the file names in the corpus comply with" 720 public Report function(
CorpusData cd, Boolean fix)
throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
721 throw new UnsupportedOperationException(
"Not supported yet.");
725 public Report function(
Corpus c, Boolean fix)
throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
726 throw new UnsupportedOperationException(
"Not supported yet.");
void addNote(String statId, String description)
Report check(CorpusData cd)
Collection< Class<?extends CorpusData > > getIsUsableFor()
NgexmaraldaCorpusChecker()
static ExmaErrorList exmaError
void addCritical(String description)
void addWarning(String statId, String description)
void addCorrect(String statId, String description)
Report exceptionalCheck()
static void main(String[] args)
void addException(Throwable e, String description)
Report requireObligatoryAnnotationTiersAndTypes()