corpus-services  1.0
ExbRefTierChecker.java
Go to the documentation of this file.
1 package de.uni_hamburg.corpora.validation;
2 
11 import java.io.IOException;
12 import java.net.URISyntaxException;
13 import java.util.ArrayList;
14 import java.util.Collection;
15 import javax.xml.parsers.ParserConfigurationException;
16 import javax.xml.transform.TransformerException;
17 import javax.xml.xpath.XPathExpressionException;
18 import org.jdom.JDOMException;
19 import org.w3c.dom.Document;
20 import org.w3c.dom.Element;
21 import org.w3c.dom.NodeList;
22 import org.xml.sax.SAXException;
23 
29 public class ExbRefTierChecker extends Checker implements CorpusFunction {
30 
31  String tierLoc = "";
32 
33  public ExbRefTierChecker() {
34  super(true);
35  }
36 
42  @Override
43  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
44  try {
45  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
46  IsUsableFor.add(cl);
47  } catch (ClassNotFoundException ex) {
48  report.addException(ex, " usable class not found");
49  }
50  return IsUsableFor;
51  }
52 
53  //testRefIds
54  @Override
55  public Report function(CorpusData cd, Boolean fix) throws IOException, SAXException, TransformerException, ParserConfigurationException, XPathExpressionException {
56  Report stats = new Report(); // create a new report for the transcript
57  Document doc = null;
59  bcd = (BasicTranscriptionData) cd;
60  doc = TypeConverter.JdomDocument2W3cDocument(bcd.getJdom()); // get the file as a document
61  String transcriptName;
62  if (doc.getElementsByTagName("transcription-name").getLength() > 0) { // check if transcript name exists for the exb file
63  transcriptName = doc.getElementsByTagName("transcription-name").item(0).getTextContent(); // get transcript name
64  } else {
65  transcriptName = "Nameless Transcript";
66  }
67 
68  NodeList tiers = doc.getElementsByTagName("tier"); // get all tiers of the transcript
69  ArrayList<Element> refTiers = new ArrayList();
70  ArrayList<String> speakerNames = new ArrayList();
71  for (int i = 0; i < tiers.getLength(); i++) { // loop for dealing with each tier
72  Element tier = (Element) tiers.item(i);
73  String category = tier.getAttribute("category"); // get category
74  String speakerName = tier.getAttribute("speaker"); // get speaker name
75  if (category.equals("ref")) {
76  refTiers.add(tier);
77  speakerNames.add(speakerName);
78  }
79  }
80 
81  // when there is no reference tier present
82  if (refTiers.size() == 0) {
83  String message = "There is no reference tier present in transcript " + transcriptName;
84  stats.addWarning(function, cd, message);
85  exmaError.addError(function, cd.getURL().getFile(), "", "", false, message);
86  } // when there are reference tier/s present
87  else {
88 
89  // iterate ref tiers
90  for (int i = 0; i < refTiers.size(); i++) {
91  NodeList events = refTiers.get(i).getElementsByTagName("event");
92  String tierId = refTiers.get(i).getAttribute("id");
93  String tierSpeaker = refTiers.get(i).getAttribute("speaker");
94  int order = 1;
95 
96  // iterate ref events
97  for (int j = 0; j < events.getLength(); j++) {
98  Element event = (Element) events.item(j);
99  String eventStart = event.getAttribute("start");
100  String eventEnd = event.getAttribute("end");
101  String wholeRef = event.getTextContent();
102  String eventReference = "event " + eventStart + "/" + eventEnd + ", tier '" + tierId + "', EXB '" + transcriptName + "'";
103 
104  //if (wholeRef.contains("(") && wholeRef.contains(".")) {
105  if (wholeRef.contains(".")) {
106 
107  // get position of character after number that shall be tested/updated
108  int end = wholeRef.length();
109  if (wholeRef.contains("(")) {
110  end = wholeRef.indexOf("(") - 1;
111  }
112 
113  // get position of first character that belongs to number in question
114  int start = wholeRef.substring(0, end).lastIndexOf(".") + 1;
115 
116  // get the number in question
117  String no = wholeRef.substring(start, end);
118  int numbering = Integer.parseInt(no);
119 
120  // test for correct numbering
121  if (order != numbering) {
122 
123  // if to be fixed
124  if (fix) {
125  String correctNo = String.format("%0" + no.length() + "d", order);
126  String correctRef = wholeRef.substring(0, start) + correctNo + wholeRef.substring(end);
127  event.setTextContent(correctRef);
128 
129  String message = "Fixed: False numbering in ref ID '" + wholeRef + "' to '" + correctNo + "' (" + eventReference + ")";
130  stats.addFix(function, cd, message);
131  } // if only to be tested
132  else {
133  String message = "False numbering in ref ID '" + wholeRef + "' (" + eventReference + ")";
134  stats.addCritical(function, cd, message);
135  exmaError.addError(function, cd.getURL().getFile(), tierId, eventStart, false, message);
136  }
137 
138  }
139  order++;
140 
141  // if there is more than one ref tier then also test speaker codes
142  if (refTiers.size() > 1) {
143  int refEnd = start - 1;
144  int refStart = -1;
145  String speakerCode = null;
146  if (wholeRef.substring(0, refEnd).contains(".")) {
147  refStart = wholeRef.substring(0, refEnd).lastIndexOf(".") + 1;
148  speakerCode = wholeRef.substring(refStart, refEnd);
149  }
150 
151  if (speakerCode != null) {
152  if (!speakerCode.equals(tierSpeaker)) {
153 
154  // if to be fixed
155  if (fix) {
156  String correctRef = event.getTextContent().substring(0, refStart) + tierSpeaker + event.getTextContent().substring(refEnd);
157  event.setTextContent(correctRef);
158 
159  String message = "Fixed: False speaker code in ref ID '" + wholeRef + "' to '" + tierSpeaker + "' (" + eventReference + ")";
160  stats.addFix(function, cd, message);
161  } // if only to be tested
162  else {
163  String message = "False speaker code in ref ID '" + wholeRef + "' (should be '" + tierSpeaker + "' in " + eventReference + ")";
164  stats.addCritical(function, cd, message);
165  exmaError.addError(function, cd.getURL().getFile(), tierId, eventStart, false, message);
166  }
167 
168  }
169  } else // if to be fixed
170  if (fix) {
171  String correctRef = event.getTextContent().substring(0, start - 1) + "." + tierSpeaker + event.getTextContent().substring(refEnd);
172  event.setTextContent(correctRef);
173 
174  String message = "Fixed: Missing speaker code in ref ID '" + wholeRef + "' to '" + tierSpeaker + "' (" + eventReference + ")";
175  stats.addFix(function, cd, message);
176  } // if only to be tested
177  else {
178  String message = "Missing speaker code in ref ID '" + wholeRef + "' (should contain '" + tierSpeaker + "' in " + eventReference + ")";
179  stats.addCritical(function, cd, message);
180  exmaError.addError(function, cd.getURL().getFile(), tierId, eventStart, false, message);
181  }
182  }
183 
184  } // ref ID does not contain any "."
185  else {
186  String message = "Unknown format of ref ID '" + wholeRef + "' in " + transcriptName;
187  stats.addCritical(function, cd, message);
188  exmaError.addError(function, cd.getURL().getFile(), tierId, eventStart, false, message);
189  }
190  }
191  }
192  }
193 
194  String result = TypeConverter.W3cDocument2String(doc);
195  CorpusIO cio = new CorpusIO();
196  if (fix) {
197  cd.updateUnformattedString(result);
198  cio.write(cd, cd.getURL());
199  }
200 
201  return stats; // return all the warnings
202  }
203 
208  @Override
209  public String getDescription() {
210  String description = "This class checks reference tiers in exb files and"
211  + " finds out whether or not the order of the numbering and speaker"
212  + " reference are correct and if there are any mistakes in the ref"
213  + " tiers, it corrects them thanks to its fix function.";
214  return description;
215  }
216 
217  @Override
218  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException {
219  Report stats = new Report();
220  for (CorpusData cdata : c.getBasicTranscriptionData()) {
221  stats.merge(function(cdata, fix));
222  }
223  return stats;
224  }
225 }
void merge(Report sr)
Definition: Report.java:73
void addCritical(String description)
Definition: Report.java:104
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
void addWarning(String statId, String description)
Definition: Report.java:164
static String W3cDocument2String(org.w3c.dom.Document doc)
void addException(Throwable e, String description)
Definition: Report.java:287
Collection< Class<?extends CorpusData > > getIsUsableFor()
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:66
void addFix(String statId, CorpusData cd, String description)
Definition: Report.java:155