// corpus-services 1.0
// ComaTranscriptionsNameChecker.java
// (Doxygen source listing. Go to the documentation of this file.)
1 package de.uni_hamburg.corpora.validation;
2 
11 import java.io.IOException;
12 import java.net.URISyntaxException;
13 import java.util.Collection;
14 import javax.xml.parsers.ParserConfigurationException;
15 import javax.xml.transform.TransformerException;
16 import javax.xml.xpath.XPathExpressionException;
17 import org.w3c.dom.Document;
18 import org.w3c.dom.Element;
19 import org.w3c.dom.NodeList;
20 import org.xml.sax.SAXException;
21 
27 public class ComaTranscriptionsNameChecker extends Checker implements CorpusFunction {
28 
30  //can fix
31  super(true);
32  }
33 
40  public Report function(CorpusData cd, Boolean fix)
41  throws SAXException, IOException, ParserConfigurationException, URISyntaxException, TransformerException, XPathExpressionException {
42  Document doc = null;
43  ComaData ccd = new ComaData();
44  ccd = (ComaData) cd;
45  doc = TypeConverter.JdomDocument2W3cDocument(ccd.getJdom()); // get the file as a document
46  NodeList communications = doc.getElementsByTagName("Communication"); // divide by Communication tags
47  Report stats = new Report(); //create a new report
48  for (int i = 0; i < communications.getLength(); i++) { //iterate through communications
49  Element communication = (Element) communications.item(i);
50  NodeList transcriptions = communication.getElementsByTagName("Transcription"); // get transcriptions of current communication
51  String communicationID = communication.getAttribute("Id"); // get communication id to use it in the warning
52  String communicationName = communication.getAttribute("Name"); // get communication name to use it in the warning
53 
54  String basicTranscriptName = "";
55  String basicFileName = "";
56  String basicNSLink = "";
57  String segmentedTranscriptName = "";
58  String segmentedFileName = "";
59  String segmentedNSLink = "";
60  String transcriptName = "";
61  String fileName = "";
62 
63  if (transcriptions.getLength() > 0) { // check if there is at least one transcription for the communication
64  for (int j = 0; j < transcriptions.getLength(); j++) { // iterate through transcriptions
65  Element transcription = (Element) transcriptions.item(j);
66  if (fix) {
67 
68  transcriptName = transcription.getElementsByTagName("Name").item(0).getTextContent();
69  fileName = transcription.getElementsByTagName("Filename").item(0).getTextContent();
70  String baseFileName = fileName.replaceAll("(\\.exb|(_s)?\\.exs)$", "");
71 
72  if (!transcriptName.equals(baseFileName)) {
73 
74  // fix the transcription Name
75  transcription.getElementsByTagName("Name").item(0).setTextContent(baseFileName);
76  stats.addFix(function, cd, "Transcription/Name (" + transcriptName + ") changed to base file name (" + baseFileName + ").");
77 
78  } else {
79  String message = "No transcription found for communication " + communicationName + ", id: " + communicationID + ".";
80  System.out.println(message);
81  stats.addCorrect(function, cd, message);
82  }
83 
84  //then save file
85  CorpusIO cio = new CorpusIO();
86  cd.updateUnformattedString(TypeConverter.W3cDocument2String(doc));
87  XMLData xml = (XMLData) cd;
88  org.jdom.Document jdomDoc = TypeConverter.W3cDocument2JdomDocument(doc);
89  xml.setJdom(jdomDoc);
90  cd = (CorpusData) xml;
91 
92  cd.updateUnformattedString(TypeConverter.JdomDocument2String(jdomDoc));
93  cio.write(cd, cd.getURL());
94 
95  } else {
96  NodeList keys = transcription.getElementsByTagName("Key"); // get keys of current transcription
97  boolean segmented = false; // flag for distinguishing basic file from segmented file
98  for (int k = 0; k < keys.getLength(); k++) { // look for the key with "segmented" attribute
99  Element key = (Element) keys.item(k);
100  if (key.getAttribute("Name").equals("segmented")) {
101  String seg = key.getTextContent();
102  if (seg.equals("true")) // check if transcription is segmented or not
103  {
104  segmented = true; // if segmented transcription then turn the flag true
105  }
106  break;
107  }
108  }
109  if (!segmented) { // get name, file name and nslink of basic transcription
110  basicTranscriptName = transcription.getElementsByTagName("Name").item(0).getTextContent();
111  basicFileName = transcription.getElementsByTagName("Filename").item(0).getTextContent();
112  basicNSLink = transcription.getElementsByTagName("NSLink").item(0).getTextContent();
113  } else { // get name, file name and nslink of segmented transcription
114  segmentedTranscriptName = transcription.getElementsByTagName("Name").item(0).getTextContent();
115  segmentedFileName = transcription.getElementsByTagName("Filename").item(0).getTextContent();
116  segmentedNSLink = transcription.getElementsByTagName("NSLink").item(0).getTextContent();
117  }
118  }
119 
120  if (!basicTranscriptName.isEmpty() && !segmentedTranscriptName.isEmpty()) {
121  if (!basicTranscriptName.equals(segmentedTranscriptName)) { // check for mismatch between names
122  // issue a warning if necessary
123  System.out.println("Basic transcription name and segmented transcription name do not match "
124  + "for communication " + communicationName + ", id: " + communicationID + ".");
125  stats.addCritical(function, cd, "Transcript name mismatch exb: " + basicTranscriptName + " exs: " + segmentedTranscriptName
126  + " for communication " + communicationName + ".");
127  } else {
128  stats.addCorrect(function, cd, "Transcript name matches exb: " + basicTranscriptName + " exs: " + segmentedTranscriptName
129  + " for communication " + communicationName + ".");
130  }
131  }
132  if (!basicFileName.isEmpty() && !segmentedFileName.isEmpty()) {
133  // check for mismatch between file names, issue a warning if necessary
134  if (!basicFileName.substring(0, basicFileName.lastIndexOf(".")).equals(segmentedFileName.substring(0, segmentedFileName.lastIndexOf("_")))) {
135  System.out.println("Basic file name and segmented file name do not match "
136  + "for communication " + communicationName + ", id: " + communicationID + ".");
137  stats.addCritical(function, cd, "Basic file name mismatch exb: " + basicFileName.substring(0, basicFileName.lastIndexOf(".")) + " exs: " + segmentedFileName.substring(0, segmentedFileName.lastIndexOf("_"))
138  + " for communication " + communicationName + ".");
139  } else {
140  stats.addCorrect(function, cd, "Basic file name matches exb: " + basicFileName.substring(0, basicFileName.lastIndexOf(".")) + " exs: " + segmentedFileName.substring(0, segmentedFileName.lastIndexOf("_"))
141  + " for communication " + communicationName + ".");
142  }
143  }
144  if (!basicNSLink.isEmpty() && !segmentedNSLink.isEmpty()) {
145  // check for mismatch between nslinks, issue a warning if necessary
146  if (!basicNSLink.substring(0, basicNSLink.lastIndexOf(".")).equals(segmentedNSLink.substring(0, segmentedNSLink.lastIndexOf("_")))) {
147  System.out.println("Basic NSLink and segmented NSLink do not match "
148  + "for communication " + communicationName + ", id: " + communicationID + ".");
149  stats.addCritical(function, cd, "NSLink filename mismatch exb: " + basicNSLink.substring(0, basicNSLink.lastIndexOf(".")) + " exs: " + segmentedNSLink.substring(0, segmentedNSLink.lastIndexOf("_"))
150  + " for communication " + communicationName + ".");
151  } else {
152  stats.addCorrect(function, cd, "NSLink filename matches exb: " + basicNSLink.substring(0, basicNSLink.lastIndexOf(".")) + " exs: " + segmentedNSLink.substring(0, segmentedNSLink.lastIndexOf("_"))
153  + " for communication " + communicationName + ".");
154  }
155  }
156  }
157  } else {
158  System.out.println("No transcriptions found "
159  + "for communication " + communicationName + ", id: " + communicationID + ".");
160  stats.addCorrect(function, cd, "No transcript found to be compared "
161  + "for communication " + communicationName + ".");
162  }
163  }
164  return stats; // return the report with warnings
165  }
166 
172  @Override
173  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
174  try {
175  Class cl = Class.forName("de.uni_hamburg.corpora.ComaData");
176  IsUsableFor.add(cl);
177  } catch (ClassNotFoundException ex) {
178  report.addException(ex, "unknown class not found error");
179  }
180  return IsUsableFor;
181  }
182 
187  @Override
188  public String getDescription() {
189  String description = "This class checks whether or not there is a mismatch "
190  + "between basic and segmented names, basic and segmented file names, "
191  + "plus their NSLinks for each communication in the coma file.";
192  return description;
193  }
194 
195  @Override
196  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException, URISyntaxException, TransformerException, XPathExpressionException {
197  Report stats;
198  cd = c.getComaData();
199  stats = function(cd, fix);
200  return stats;
201  }
202 
203 }
// Members referenced above (cross-reference notes from the Doxygen listing):
//   static org.jdom.Document W3cDocument2JdomDocument(org.w3c.dom.Document input)
//   void addCritical(String description)
//       Definition: Report.java:104
//   void setJdom(Document jdom)
//   static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
//   void addCorrect(String statId, String description)
//       Definition: Report.java:217
//   static String W3cDocument2String(org.w3c.dom.Document doc)
//   static String JdomDocument2String(org.jdom.Document jdomDocument)
//   void addException(Throwable e, String description)
//       Definition: Report.java:287
//   void write(CorpusData cd, URL url)
//       Definition: CorpusIO.java:66
//   void addFix(String statId, CorpusData cd, String description)
//       Definition: Report.java:155