corpus-services  1.0
NgexmaraldaCorpusChecker.java
Go to the documentation of this file.
1 /*
2  * @file NgexmaraldaCorpusChecker.java
3  *
4  * Nganasan Spoken Language Corpus specific checkers.
5  */
6 package de.uni_hamburg.corpora.validation;
7 
12 import java.io.File;
13 import java.io.IOException;
14 import java.nio.file.Paths;
15 import java.util.Set;
16 import java.util.HashSet;
17 import java.util.Map;
18 import java.util.HashMap;
19 import java.util.List;
20 import java.util.Collection;
21 import java.util.logging.Level;
22 import java.util.logging.Logger;
23 import javax.xml.parsers.ParserConfigurationException;
24 import org.apache.commons.lang.StringUtils;
25 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
26 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
27 import org.exmaralda.partitureditor.jexmaralda.BasicBody;
28 import org.exmaralda.partitureditor.jexmaralda.Tier;
29 import org.jdom.Document;
30 import org.jdom.Element;
31 import org.jdom.JDOMException;
32 import org.jdom.xpath.XPath;
33 import org.xml.sax.SAXException;
35 import java.net.URISyntaxException;
36 import java.security.NoSuchAlgorithmException;
37 import javax.xml.transform.TransformerException;
38 import javax.xml.xpath.XPathExpressionException;
39 import org.exmaralda.partitureditor.fsm.FSMException;
40 
46 public class NgexmaraldaCorpusChecker extends Checker implements CorpusFunction {
47 
48  private Element communication;
49  private Element basTrans;
50  private Element segTrans;
51  private Element rec;
52  private String comafilename;
53  private File comafile;
54  private String comadirname;
55  final String NSLC = "nslc";
56 
58 
62  super(false);
63  }
64 
65  public Report check() {
66  Report stats = new Report();
67  try {
68  stats = exceptionalCheck();
70  } catch (JexmaraldaException je) {
71  stats.addException(je, "Unknown parsing error");
72  } catch (JDOMException jdome) {
73  stats.addException(jdome, "Unknown parsing error");
74  } catch (SAXException saxe) {
75  stats.addException(saxe, "Unknown parsing error");
76  } catch (IOException ioe) {
77  stats.addException(ioe, "Reading error");
78  }
79  return stats;
80  }
81 
82  public Report exceptionalCheck() throws JDOMException,
83  IOException {
84  Report stats = new Report();
85  Document nganasanCorpus
86  = org.exmaralda.common.jdomutilities.IOUtilities.readDocumentFromLocalFile(comafilename);
87  XPath xpCommunications = XPath.newInstance("//Communication");
88  List allCommunications = xpCommunications.selectNodes(nganasanCorpus);
89  for (Object o : allCommunications) {
90  communication = (Element) o;
91  //retrieve the communication name
92  String communicationName = communication.getAttributeValue("Name");
93  //pick up basic transcriptions
94  XPath xpBasTrans = XPath.newInstance("Transcription[Description"
95  + "/Key[@Name='segmented']/text()='false']");
96  List allBasTrans = xpBasTrans.selectNodes(communication);
97  for (Object oB : allBasTrans) {
98  basTrans = (Element) oB;
99  String relPath = basTrans.getChildText("NSLink");
100  String filePath = comadirname + File.separator + relPath;
101  File file = new File(filePath);
102  if (!file.isFile()) {
103  stats.addCritical(NSLC,
104  "Basic transcription file doesn't exist at "
105  + "NSLink for " + communicationName);
106  } else if (Paths.get(relPath).isAbsolute()) {
107  stats.addCritical(NSLC,
108  "Basic transcription NSLink is absolute for "
109  + communicationName);
110  } else if (!relPath.endsWith(communicationName + ".exb")) {
111  stats.addCritical(NSLC,
112  "Wrong basic transcription NSLink for "
113  + communicationName);
114  } else {
115  stats.addCorrect(NSLC,
116  "Basic transcription NSLink OK: "
117  + communicationName);
118  }
119  String basTransName = basTrans.getChildText("Name");
120  if (!basTransName.equals(communicationName)) {
121  stats.addCritical(NSLC,
122  "Wrong basic transcription name for "
123  + communicationName, basTransName + " should be "
124  + communicationName);
125  } else {
126  stats.addCorrect(NSLC,
127  "Basic transcription name OK for "
128  + communicationName + ": " + basTransName);
129  }
130  if (!basTrans.getChildText("Filename").equals(
131  communicationName + ".exb")) {
132  stats.addCritical(NSLC,
133  "Wrong basic transcripton filename for "
134  + communicationName);
135  } else {
136  stats.addCorrect(NSLC,
137  "Correct Filename for basic transcription "
138  + communicationName);
139  }
140  }
141  XPath xpSegTrans = XPath.newInstance(
142  "Transcription[Description"
143  + "/Key[@Name='segmented']/text()='true']");
144  List allSegTrans = xpSegTrans.selectNodes(communication);
145  for (Object oS : allSegTrans) {
146  segTrans = (Element) oS;
147  String relPath = segTrans.getChildText("NSLink");
148  String filePath = comadirname + File.separator + relPath;
149  File file = new File(filePath);
150  if (!file.isFile()) {
151  stats.addCritical(NSLC,
152  "Segmented transcription file doesn't exist at"
153  + " NSLink for " + communicationName);
154  } else if (Paths.get("relPath").isAbsolute()) {
155  stats.addCritical(NSLC,
156  "Segmented transcription NSLink is absolute for "
157  + communicationName);
158  } else if (!relPath.endsWith(communicationName + "_s.exs")) {
159  stats.addCritical(NSLC,
160  "Wrong segmented transcription NSLink for "
161  + communicationName, relPath + " should end in "
162  + communicationName + "_s.exs");
163  } else {
164  stats.addCorrect(NSLC,
165  "Correct segmented transcription NSLink for "
166  + communicationName);
167  }
168  String segTransName = segTrans.getChildText("Name");
169  if (!segTransName.equals(communicationName)) {
170  stats.addCritical(NSLC,
171  "Wrong segmented transcription name for "
172  + communicationName, segTransName + " should be "
173  + communicationName);
174  } else if (!segTrans.getChildText("Filename").equals(
175  communicationName + "_s.exs")) {
176  stats.addCritical(NSLC,
177  "Wrong segmented transcription filename for "
178  + communicationName,
179  segTrans.getChildText("Filename")
180  + " should be " + communicationName + "_s.exs");
181  } else {
182  stats.addCorrect(NSLC,
183  "Correct Filename for segmented transcription "
184  + communicationName);
185  }
186  }
187  XPath xpRec = XPath.newInstance("Recording/Media");
188  List allRec = xpRec.selectNodes(communication);
189  for (Object oR : allRec) {
190  Element media = (Element) oR;
191  rec = media.getParentElement();
192  String relPath = media.getChildText("NSLink");
193  String filePath = comadirname + File.separator + relPath;
194  File file = new File(filePath);
195  if (!file.isFile()) {
196  stats.addCritical(NSLC,
197  "Recording file doesn't exist at NSLink for "
198  + communicationName);
199  } else if (Paths.get("relPath").isAbsolute()) {
200  stats.addCritical(NSLC,
201  "Recording NSLink is absolute for "
202  + communicationName);
203  } else if (!StringUtils.substringBefore(relPath, ".").endsWith(
204  communicationName)) {
205  stats.addCritical(NSLC,
206  "Wrong recording NSLink for "
207  + communicationName,
208  StringUtils.substringBefore(relPath, ".")
209  + " should end with " + communicationName);
210  } else {
211  stats.addCorrect(NSLC,
212  "Recording NSLink is correct for "
213  + communicationName);
214  }
215  String recName = rec.getChildText("Name");
216  if (!recName.equals(communicationName)) {
217  stats.addCritical(NSLC,
218  "Wrong recording name for " + communicationName,
219  recName + " should be " + communicationName);
220  }
221  }
222  }
223  return stats;
224  }
225 
336  SAXException, JDOMException, IOException, JexmaraldaException {
337  Map<String, String> obligatoryTiers = new HashMap<String, String>();
338  Map<String, String> optionalTiers = new HashMap<String, String>();
339  obligatoryTiers.put("ref", "Name of the communication");
340  optionalTiers.put("st", "Source texts: normally in Cyrillic "
341  + "transliteration");
342  obligatoryTiers.put("ts", "Transcription (what is heard)");
343  obligatoryTiers.put("tx", "Tier for interlinearization)");
344  obligatoryTiers.put("mb", "Morpheme break");
345  obligatoryTiers.put("mp", "Morphophonemes, underlying forms");
346  obligatoryTiers.put("gr", "Morphological annotation: Russian gloss of "
347  + "each morpheme");
348  obligatoryTiers.put("ge", "Morphological annotation: Egnlish gloss of "
349  + "each morpheme");
350  obligatoryTiers.put("mc", "Part of speech of each morpheme");
351  obligatoryTiers.put("ps", "Part of speech of each word");
352  obligatoryTiers.put("SeR", "Annotation of semantic roles");
353  obligatoryTiers.put("SyF", "Annotation of syntactic function");
354  optionalTiers.put("IST", "Annotation of information status");
355  optionalTiers.put("CW", "Annotation of code switching");
356  obligatoryTiers.put("fr", "Russian free translation");
357  optionalTiers.put("fe", "English free translation");
358  optionalTiers.put("fh", "Hungarian free translation");
359  optionalTiers.put("so", "Source origin");
360  optionalTiers.put("fg", "German free translation");
361  optionalTiers.put("nt", "Notes on the text unit");
362  Map<String, String> tierTypes = new HashMap<String, String>();
363  tierTypes.put("ref", "d");
364  tierTypes.put("st", "d");
365  tierTypes.put("ts", "d");
366  tierTypes.put("tx", "t");
367  tierTypes.put("mb", "a");
368  tierTypes.put("mp", "a");
369  tierTypes.put("gr", "a");
370  tierTypes.put("ge", "a");
371  tierTypes.put("mc", "a");
372  tierTypes.put("ps", "a");
373  tierTypes.put("SeR", "a");
374  tierTypes.put("SyF", "a");
375  tierTypes.put("IST", "a");
376  tierTypes.put("CW", "a");
377  tierTypes.put("fr", "d");
378  tierTypes.put("fe", "d");
379  tierTypes.put("fg", "d");
380  tierTypes.put("nt", "d");
381  tierTypes.put("so", "d");
382  tierTypes.put("fh", "d");
383 
384  Report stats = new Report();
385  Document nganasanCorpus
386  = org.exmaralda.common.jdomutilities.IOUtilities.readDocumentFromLocalFile(comafilename);
387  XPath xpCommunications = XPath.newInstance("//Communication");
388  List allCommunications = xpCommunications.selectNodes(nganasanCorpus);
389  Set<String> skipTiers = new HashSet<String>();
390  skipTiers.add("COLUMN-LABEL");
391  skipTiers.add("ROW-LABEL");
392  skipTiers.add("SUB-ROW-LABEL");
393  skipTiers.add("EMPTY");
394  skipTiers.add("EMPTY-EDITOR");
395 
396  for (Object o : allCommunications) {
397  communication = (Element) o;
398  //retrieve the communication name
399  String communicationName = communication.getAttributeValue("Name");
400  //pick up basic transcriptions
401  XPath xpBasTrans = XPath.newInstance("Transcription[Description"
402  + "/Key[@Name='segmented']/text()='false']");
403  List allBasTrans = xpBasTrans.selectNodes(communication);
404  for (Object oB : allBasTrans) {
405  basTrans = (Element) oB;
406  String relPath = basTrans.getChildText("NSLink");
407  String filePath = comadirname + File.separator + relPath;
408  File file = new File(filePath);
409  if (!file.isFile()) {
410  // we already checked validity of files in other checks
411  continue;
412  }
413  Set<String> obligatoriesSeen = new HashSet<String>();
414  Set<String> optionalsSeen = new HashSet<String>();
415  Element desc = basTrans.getChild("Description");
416  BasicTranscription bt = new BasicTranscription(filePath);
417  BasicBody bb = bt.getBody();
418  String[] tierIDs = bb.getAllTierIDs();
419  for (String tierID : tierIDs) {
420  if (skipTiers.contains(tierID)) {
421  stats.addNote(NSLC,
422  "Skipped a tier: " + tierID,
423  "This tier does not need to be included in "
424  + "coma file");
425  continue;
426  }
427  Tier tier = null;
428  try {
429  tier = bb.getTierWithID(tierID);
430  } catch (JexmaraldaException je) {
431  stats.addException(je, "ERRORR: tier with ID " + tierID
432  + " is lost...");
433  exmaError.addError(NSLC, comadirname + relPath, tierID, "", false, "ERROR: tier with ID " + tierID
434  + " is lost...");
435  continue;
436  }
437  String displayName = tier.getDisplayName();
438  String category = tier.getCategory();
439  String tierType = tier.getType();
440  if (obligatoryTiers.containsKey(category)) {
441  obligatoriesSeen.add(category);
442  } else if (optionalTiers.containsKey(category)) {
443  optionalsSeen.add(category);
444  } else {
445  stats.addCritical(NSLC,
446  "Unrecognised tier name: "
447  + tierID);
448  exmaError.addError(NSLC, comadirname + relPath, tierID, "", false, "Unrecognised tier name: "
449  + tierID);
450 
451  }
452  if (tierTypes.containsKey(category)) {
453  if (!tierTypes.get(category).equals(tierType)) {
454  stats.addCritical(NSLC,
455  "Wrong tier type for: "
456  + tierID, "Switch to annotation or "
457  + " description tier");
458  exmaError.addError(NSLC, comadirname + relPath, tierID, "", false, "Wrong tier type for: "
459  + tierID);
460 
461  } else {
462  stats.addCorrect(NSLC,
463  "Correct tier type for: " + tierID);
464  }
465  } else {
466  stats.addWarning(NSLC,
467  "Not known if tier: "
468  + tierID + " should be annotation or "
469  + "description");
470  exmaError.addError(NSLC, comadirname + relPath, tierID, "", false, "Not known if tier: "
471  + tierID + " should be annotation or "
472  + "description");
473 
474  }
475  if (!category.equals(tierID)) {
476  stats.addCritical(NSLC,
477  "Tier ID should match category, "
478  + "but " + tierID + " is not " + category);
479  exmaError.addError(NSLC, comadirname + relPath, tierID, "", false, "Tier ID should match category, "
480  + "but " + tierID + " is not " + category);
481 
482  }
483  } // for each tier
484  for (Map.Entry<String, String> entry : obligatoryTiers.entrySet()) {
485  boolean found = false;
486  for (String seen : obligatoriesSeen) {
487  if (entry.getKey().equals(seen)) {
488  found = true;
489  }
490  }
491  if (!found) {
492  stats.addCritical(
493  "Missing required tier: "
494  + entry.getKey() + ": " + entry.getValue());
495  exmaError.addError(NSLC, comadirname + relPath, "", "", false, "Missing required tier: "
496  + entry.getKey() + ": " + entry.getValue());
497  }
498  }
499  } // for each transcirption
500  }
501  return stats;
502  }
503 
504  public static void main(String[] args) {
505  NgexmaraldaCorpusChecker checker;
506  try {
507  checker = new NgexmaraldaCorpusChecker();
508  checker.exceptionalCheck();
509  System.exit(0);
510  } catch (JDOMException ex) {
511  ex.printStackTrace();
512  } catch (IOException ex) {
513  ex.printStackTrace();
514  }
515  }
516 
522  public Report check(CorpusData cd) throws SAXException, JexmaraldaException {
523  Report stats = new Report();
524  try {
525  stats = exceptionalCheck(cd);
527  } catch (JexmaraldaException je) {
528  stats.addException(je, "Unknown parsing error");
529  } catch (JDOMException jdome) {
530  stats.addException(jdome, "Unknown parsing error");
531  } catch (SAXException saxe) {
532  stats.addException(saxe, "Unknown parsing error");
533  } catch (IOException ioe) {
534  stats.addException(ioe, "Reading/writing error");
535  } catch (ParserConfigurationException ex) {
536  Logger.getLogger(NgexmaraldaCorpusChecker.class.getName()).log(Level.SEVERE, null, ex);
537  }
538  return stats;
539  }
540 
546  throws SAXException, IOException, ParserConfigurationException, JexmaraldaException, JDOMException {
547  Report stats = new Report();
548  comafilename = cd.getURL().getFile();
549  comadirname = comafilename.substring(0, comafilename.lastIndexOf("/") + 1);
550  Document nganasanCorpus
551  = org.exmaralda.common.jdomutilities.IOUtilities.readDocumentFromLocalFile(comafilename);
552  XPath xpCommunications = XPath.newInstance("//Communication");
553  List allCommunications = xpCommunications.selectNodes(nganasanCorpus);
554  for (Object o : allCommunications) {
555  communication = (Element) o;
556  //retrieve the communication name
557  String communicationName = communication.getAttributeValue("Name");
558  //pick up basic transcriptions
559  XPath xpBasTrans = XPath.newInstance("Transcription[Description"
560  + "/Key[@Name='segmented']/text()='false']");
561  List allBasTrans = xpBasTrans.selectNodes(communication);
562  for (Object oB : allBasTrans) {
563  basTrans = (Element) oB;
564  String relPath = basTrans.getChildText("NSLink");
565  String filePath = comadirname + File.separator + relPath;
566  File file = new File(filePath);
567  if (!file.isFile()) {
568  stats.addCritical(NSLC,
569  "Basic transcription file doesn't exist at "
570  + "NSLink for " + communicationName);
571  } else if (Paths.get(relPath).isAbsolute()) {
572  stats.addCritical(NSLC,
573  "Basic transcription NSLink is absolute for "
574  + communicationName);
575  } else if (!relPath.endsWith(communicationName + ".exb")) {
576  stats.addCritical(NSLC,
577  "Wrong basic transcription NSLink for "
578  + communicationName);
579  } else {
580  stats.addCorrect(NSLC,
581  "Basic transcription NSLink OK: "
582  + communicationName);
583  }
584  String basTransName = basTrans.getChildText("Name");
585  if (!basTransName.equals(communicationName)) {
586  stats.addCritical(NSLC,
587  "Wrong basic transcription name for "
588  + communicationName, basTransName + " should be "
589  + communicationName);
590  } else {
591  stats.addCorrect(NSLC,
592  "Basic transcription name OK for "
593  + communicationName + ": " + basTransName);
594  }
595  if (!basTrans.getChildText("Filename").equals(
596  communicationName + ".exb")) {
597  stats.addCritical(NSLC,
598  "Wrong basic transcripton filename for "
599  + communicationName);
600  } else {
601  stats.addCorrect(NSLC,
602  "Correct Filename for basic transcription "
603  + communicationName);
604  }
605  }
606  XPath xpSegTrans = XPath.newInstance(
607  "Transcription[Description"
608  + "/Key[@Name='segmented']/text()='true']");
609  List allSegTrans = xpSegTrans.selectNodes(communication);
610  for (Object oS : allSegTrans) {
611  segTrans = (Element) oS;
612  String relPath = segTrans.getChildText("NSLink");
613  String filePath = comadirname + File.separator + relPath;
614  File file = new File(filePath);
615  if (!file.isFile()) {
616  stats.addCritical(NSLC,
617  "Segmented transcription file doesn't exist at"
618  + " NSLink for " + communicationName);
619  } else if (Paths.get("relPath").isAbsolute()) {
620  stats.addCritical(NSLC,
621  "Segmented transcription NSLink is absolute for "
622  + communicationName);
623  } else if (!relPath.endsWith(communicationName + "_s.exs")) {
624  stats.addCritical(NSLC,
625  "Wrong segmented transcription NSLink for "
626  + communicationName, relPath + " should end in "
627  + communicationName + "_s.exs");
628  } else {
629  stats.addCorrect(NSLC,
630  "Correct segmented transcription NSLink for "
631  + communicationName);
632  }
633  String segTransName = segTrans.getChildText("Name");
634  if (!segTransName.equals(communicationName)) {
635  stats.addCritical(NSLC,
636  "Wrong segmented transcription name for "
637  + communicationName, segTransName + " should be "
638  + communicationName);
639  } else if (!segTrans.getChildText("Filename").equals(
640  communicationName + "_s.exs")) {
641  stats.addCritical(NSLC,
642  "Wrong segmented transcription filename for "
643  + communicationName,
644  segTrans.getChildText("Filename")
645  + " should be " + communicationName + "_s.exs");
646  } else {
647  stats.addCorrect(NSLC,
648  "Correct Filename for segmented transcription "
649  + communicationName);
650  }
651  }
652  XPath xpRec = XPath.newInstance("Recording/Media");
653  List allRec = xpRec.selectNodes(communication);
654  for (Object oR : allRec) {
655  Element media = (Element) oR;
656  rec = media.getParentElement();
657  String relPath = media.getChildText("NSLink");
658  String filePath = comadirname + File.separator + relPath;
659  File file = new File(filePath);
660  if (!file.isFile()) {
661  stats.addCritical(NSLC,
662  "Recording file doesn't exist at NSLink for "
663  + communicationName);
664  } else if (Paths.get("relPath").isAbsolute()) {
665  stats.addCritical(NSLC,
666  "Recording NSLink is absolute for "
667  + communicationName);
668  } else if (!StringUtils.substringBefore(relPath, ".").endsWith(
669  communicationName)) {
670  stats.addCritical(NSLC,
671  "Wrong recording NSLink for "
672  + communicationName,
673  StringUtils.substringBefore(relPath, ".")
674  + " should end with " + communicationName);
675  } else {
676  stats.addCorrect(NSLC,
677  "Recording NSLink is correct for "
678  + communicationName);
679  }
680  String recName = rec.getChildText("Name");
681  if (!recName.equals(communicationName)) {
682  stats.addCritical(NSLC,
683  "Wrong recording name for " + communicationName,
684  recName + " should be " + communicationName);
685  }
686  }
687  }
688  return stats;
689  }
690 
696  @Override
697  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
698  try {
699  Class cl = Class.forName("de.uni_hamburg.corpora.ComaData");
700  IsUsableFor.add(cl);
701  } catch (ClassNotFoundException ex) {
702  Logger.getLogger(NgexmaraldaCorpusChecker.class.getName()).log(Level.SEVERE, null, ex);
703  }
704  return IsUsableFor;
705  }
706 
711  @Override
712  public String getDescription() {
713  String description = "This class is the check procedure for the Nganasan"
714  + " Corpus and checks if the file names in the corpus comply with"
715  + " the coma file.";
716  return description;
717  }
718 
719  @Override
720  public Report function(CorpusData cd, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
721  throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
722  }
723 
724  @Override
725  public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
726  throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
727  }
728 
729 
730 }
void addNote(String statId, String description)
Definition: Report.java:245
void merge(Report sr)
Definition: Report.java:73
void addCritical(String description)
Definition: Report.java:104
void addWarning(String statId, String description)
Definition: Report.java:164
void addCorrect(String statId, String description)
Definition: Report.java:217
void addException(Throwable e, String description)
Definition: Report.java:287