1 package de.uni_hamburg.corpora.validation;
11 import java.io.IOException;
12 import java.net.URISyntaxException;
13 import java.util.ArrayList;
14 import java.util.Collection;
15 import javax.xml.parsers.ParserConfigurationException;
16 import javax.xml.transform.TransformerException;
17 import javax.xml.xpath.XPathExpressionException;
18 import org.jdom.JDOMException;
19 import org.w3c.dom.Document;
20 import org.w3c.dom.Element;
21 import org.w3c.dom.NodeList;
22 import org.xml.sax.SAXException;
45 Class cl = Class.forName(
"de.uni_hamburg.corpora.BasicTranscriptionData");
47 }
catch (ClassNotFoundException ex) {
/**
 * Checks the reference ("ref") tiers of a single transcription.
 *
 * For every event in a tier whose category is "ref", the numeric suffix of
 * the ref ID is compared with {@code order}; when the transcription has more
 * than one ref tier, the speaker code in front of that number is compared
 * with the tier's "speaker" attribute. Findings are recorded through
 * {@code exmaError.addError} or {@code stats.addFix}.
 *
 * @param cd  the corpus data to check; its URL is used in the error entries
 *            and as the write target
 * @param fix NOTE(review): presumably selects between repairing the event
 *            text (the "Fixed: ..." messages) and only reporting - confirm
 * @return NOTE(review): presumably the collected report - confirm
 */
55 public Report function(
CorpusData cd, Boolean fix)
throws IOException, SAXException, TransformerException, ParserConfigurationException, XPathExpressionException {
61 String transcriptName;
// Use the text of the first <transcription-name> element when at least one exists.
62 if (doc.getElementsByTagName(
"transcription-name").getLength() > 0) {
63 transcriptName = doc.getElementsByTagName(
"transcription-name").item(0).getTextContent();
// Placeholder name used in messages; NOTE(review): presumably the else branch of the check above - confirm.
65 transcriptName =
"Nameless Transcript";
// Scan every <tier> element and pick out the ones with category "ref".
68 NodeList tiers = doc.getElementsByTagName(
"tier");
69 ArrayList<Element> refTiers =
new ArrayList();
70 ArrayList<String> speakerNames =
new ArrayList();
71 for (
int i = 0; i < tiers.getLength(); i++) {
72 Element tier = (Element) tiers.item(i);
73 String category = tier.getAttribute(
"category");
74 String speakerName = tier.getAttribute(
"speaker");
75 if (category.equals(
"ref")) {
// NOTE(review): the tier itself is presumably added to refTiers next to this call - confirm.
77 speakerNames.add(speakerName);
// Without any ref tier there is nothing to check; the error is logged with empty tier id and event start.
82 if (refTiers.size() == 0) {
83 String message =
"There is no reference tier present in transcript " + transcriptName;
85 exmaError.addError(
function, cd.getURL().getFile(),
"",
"",
false, message);
// Walk through the events of each ref tier.
90 for (
int i = 0; i < refTiers.size(); i++) {
91 NodeList events = refTiers.get(i).getElementsByTagName(
"event");
92 String tierId = refTiers.get(i).getAttribute(
"id");
93 String tierSpeaker = refTiers.get(i).getAttribute(
"speaker");
97 for (
int j = 0; j < events.getLength(); j++) {
98 Element
event = (Element) events.item(j);
99 String eventStart =
event.getAttribute(
"start");
100 String eventEnd =
event.getAttribute(
"end");
// The complete text of the event is treated as the ref ID.
101 String wholeRef =
event.getTextContent();
// Human-readable location (event span, tier id, transcript name) appended to the messages below.
102 String eventReference =
"event " + eventStart +
"/" + eventEnd +
", tier '" + tierId +
"', EXB '" + transcriptName +
"'";
// Only ref IDs containing a "." are parsed; the number is expected after the last ".".
105 if (wholeRef.contains(
".")) {
// By default the number runs to the end of the text.
108 int end = wholeRef.length();
109 if (wholeRef.contains(
"(")) {
// Stop one character before the first "("; NOTE(review): presumably skips a separating space - confirm.
110 end = wholeRef.indexOf(
"(") - 1;
// Index of the first character after the last "." that precedes end.
114 int start = wholeRef.substring(0, end).lastIndexOf(
".") + 1;
117 String no = wholeRef.substring(start, end);
// Integer.parseInt throws NumberFormatException when the extracted text is not a plain integer.
118 int numbering = Integer.parseInt(no);
// NOTE(review): order is declared elsewhere; presumably the expected running number - confirm.
121 if (order != numbering) {
// Zero-pad the expected number to the width of the number found in the ref ID.
125 String correctNo = String.format(
"%0" + no.length() +
"d", order);
// Swap only the numeric part; the text before start and from end onwards is kept.
126 String correctRef = wholeRef.substring(0, start) + correctNo + wholeRef.substring(end);
127 event.setTextContent(correctRef);
129 String message =
"Fixed: False numbering in ref ID '" + wholeRef +
"' to '" + correctNo +
"' (" + eventReference +
")";
130 stats.
addFix(
function, cd, message);
// Report-only message for a wrong number; NOTE(review): presumably the non-fixing branch - confirm.
133 String message =
"False numbering in ref ID '" + wholeRef +
"' (" + eventReference +
")";
135 exmaError.addError(
function, cd.getURL().getFile(), tierId, eventStart,
false, message);
// The speaker code is only examined when the transcription has more than one ref tier.
142 if (refTiers.size() > 1) {
// Index of the "." directly in front of the number.
143 int refEnd = start - 1;
145 String speakerCode = null;
// If another "." occurs before that one, the speaker code is the text between the two.
146 if (wholeRef.substring(0, refEnd).contains(
".")) {
147 refStart = wholeRef.substring(0, refEnd).lastIndexOf(
".") + 1;
148 speakerCode = wholeRef.substring(refStart, refEnd);
151 if (speakerCode != null) {
// A speaker code is present: it has to equal the tier's "speaker" attribute.
152 if (!speakerCode.equals(tierSpeaker)) {
// Built from the event's current text (not wholeRef); the code between refStart and refEnd is replaced.
156 String correctRef =
event.getTextContent().substring(0, refStart) + tierSpeaker +
event.getTextContent().substring(refEnd);
157 event.setTextContent(correctRef);
159 String message =
"Fixed: False speaker code in ref ID '" + wholeRef +
"' to '" + tierSpeaker +
"' (" + eventReference +
")";
160 stats.
addFix(
function, cd, message);
// Report-only message for a wrong speaker code; NOTE(review): presumably the non-fixing branch - confirm.
163 String message =
"False speaker code in ref ID '" + wholeRef +
"' (should be '" + tierSpeaker +
"' in " + eventReference +
")";
165 exmaError.addError(
function, cd.getURL().getFile(), tierId, eventStart,
false, message);
// No speaker code was found: insert "." + tierSpeaker in front of the "." that precedes the number.
171 String correctRef =
event.getTextContent().substring(0, start - 1) +
"." + tierSpeaker +
event.getTextContent().substring(refEnd);
172 event.setTextContent(correctRef);
174 String message =
"Fixed: Missing speaker code in ref ID '" + wholeRef +
"' to '" + tierSpeaker +
"' (" + eventReference +
")";
175 stats.
addFix(
function, cd, message);
// Report-only message for a missing speaker code; NOTE(review): presumably the non-fixing branch - confirm.
178 String message =
"Missing speaker code in ref ID '" + wholeRef +
"' (should contain '" + tierSpeaker +
"' in " + eventReference +
")";
180 exmaError.addError(
function, cd.getURL().getFile(), tierId, eventStart,
false, message);
// Message for ref IDs that could not be parsed; NOTE(review): presumably the else branch of the "." check - confirm.
186 String message =
"Unknown format of ref ID '" + wholeRef +
"' in " + transcriptName;
188 exmaError.addError(
function, cd.getURL().getFile(), tierId, eventStart,
false, message);
// Store the new content in cd and write it to cd's own URL.
// NOTE(review): result and cio are declared elsewhere; result is presumably the serialised, modified document - confirm.
197 cd.updateUnformattedString(result);
198 cio.
write(cd, cd.getURL());
210 String description =
"This class checks reference tiers in exb files and" 211 +
" finds out whether or not the order of the numbering and speaker" 212 +
" reference are correct and if there are any mistakes in the ref" 213 +
" tiers, it corrects them thanks to its fix function.";
/**
 * Runs the single-file check on every basic transcription of the corpus and
 * merges each resulting report into {@code stats}.
 *
 * @param c   corpus whose basic transcription data is iterated
 * @param fix passed through unchanged to the per-file check
 * @return NOTE(review): presumably the merged report - confirm
 */
218 public Report function(
Corpus c, Boolean fix)
throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException {
// One per-file check per transcription; every report is folded into stats.
220 for (
CorpusData cdata : c.getBasicTranscriptionData()) {
221 stats.
merge(
function(cdata, fix));
static ExmaErrorList exmaError
void addCritical(String description)
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
void addWarning(String statId, String description)
static String W3cDocument2String(org.w3c.dom.Document doc)
void addException(Throwable e, String description)
Collection< Class<?extends CorpusData > > getIsUsableFor()
void write(CorpusData cd, URL url)
void addFix(String statId, CorpusData cd, String description)