1 package de.uni_hamburg.corpora.validation;
11 import java.io.IOException;
12 import java.net.URISyntaxException;
13 import java.util.Collection;
14 import javax.xml.parsers.ParserConfigurationException;
15 import javax.xml.transform.TransformerException;
16 import javax.xml.xpath.XPathExpressionException;
17 import org.w3c.dom.Document;
18 import org.w3c.dom.Element;
19 import org.w3c.dom.NodeList;
20 import org.xml.sax.SAXException;
41 throws SAXException, IOException, ParserConfigurationException, URISyntaxException, TransformerException, XPathExpressionException {
46 NodeList communications = doc.getElementsByTagName(
"Communication");
48 for (
int i = 0; i < communications.getLength(); i++) {
49 Element communication = (Element) communications.item(i);
50 NodeList transcriptions = communication.getElementsByTagName(
"Transcription");
51 String communicationID = communication.getAttribute(
"Id");
52 String communicationName = communication.getAttribute(
"Name");
54 String basicTranscriptName =
"";
55 String basicFileName =
"";
56 String basicNSLink =
"";
57 String segmentedTranscriptName =
"";
58 String segmentedFileName =
"";
59 String segmentedNSLink =
"";
60 String transcriptName =
"";
63 if (transcriptions.getLength() > 0) {
64 for (
int j = 0; j < transcriptions.getLength(); j++) {
65 Element transcription = (Element) transcriptions.item(j);
68 transcriptName = transcription.getElementsByTagName(
"Name").item(0).getTextContent();
69 fileName = transcription.getElementsByTagName(
"Filename").item(0).getTextContent();
70 String baseFileName = fileName.replaceAll(
"(\\.exb|(_s)?\\.exs)$",
"");
72 if (!transcriptName.equals(baseFileName)) {
75 transcription.getElementsByTagName(
"Name").item(0).setTextContent(baseFileName);
76 stats.
addFix(
function, cd,
"Transcription/Name (" + transcriptName +
") changed to base file name (" + baseFileName +
").");
79 String message =
"No transcription found for communication " + communicationName +
", id: " + communicationID +
".";
80 System.out.println(message);
93 cio.
write(cd, cd.getURL());
96 NodeList keys = transcription.getElementsByTagName(
"Key");
97 boolean segmented =
false;
98 for (
int k = 0; k < keys.getLength(); k++) {
99 Element key = (Element) keys.item(k);
100 if (key.getAttribute(
"Name").equals(
"segmented")) {
101 String seg = key.getTextContent();
102 if (seg.equals(
"true"))
110 basicTranscriptName = transcription.getElementsByTagName(
"Name").item(0).getTextContent();
111 basicFileName = transcription.getElementsByTagName(
"Filename").item(0).getTextContent();
112 basicNSLink = transcription.getElementsByTagName(
"NSLink").item(0).getTextContent();
114 segmentedTranscriptName = transcription.getElementsByTagName(
"Name").item(0).getTextContent();
115 segmentedFileName = transcription.getElementsByTagName(
"Filename").item(0).getTextContent();
116 segmentedNSLink = transcription.getElementsByTagName(
"NSLink").item(0).getTextContent();
120 if (!basicTranscriptName.isEmpty() && !segmentedTranscriptName.isEmpty()) {
121 if (!basicTranscriptName.equals(segmentedTranscriptName)) {
123 System.out.println(
"Basic transcription name and segmented transcription name do not match " 124 +
"for communication " + communicationName +
", id: " + communicationID +
".");
125 stats.
addCritical(
function, cd,
"Transcript name mismatch exb: " + basicTranscriptName +
" exs: " + segmentedTranscriptName
126 +
" for communication " + communicationName +
".");
128 stats.
addCorrect(
function, cd,
"Transcript name matches exb: " + basicTranscriptName +
" exs: " + segmentedTranscriptName
129 +
" for communication " + communicationName +
".");
132 if (!basicFileName.isEmpty() && !segmentedFileName.isEmpty()) {
134 if (!basicFileName.substring(0, basicFileName.lastIndexOf(
".")).equals(segmentedFileName.substring(0, segmentedFileName.lastIndexOf(
"_")))) {
135 System.out.println(
"Basic file name and segmented file name do not match " 136 +
"for communication " + communicationName +
", id: " + communicationID +
".");
137 stats.
addCritical(
function, cd,
"Basic file name mismatch exb: " + basicFileName.substring(0, basicFileName.lastIndexOf(
".")) +
" exs: " + segmentedFileName.substring(0, segmentedFileName.lastIndexOf(
"_"))
138 +
" for communication " + communicationName +
".");
140 stats.
addCorrect(
function, cd,
"Basic file name matches exb: " + basicFileName.substring(0, basicFileName.lastIndexOf(
".")) +
" exs: " + segmentedFileName.substring(0, segmentedFileName.lastIndexOf(
"_"))
141 +
" for communication " + communicationName +
".");
144 if (!basicNSLink.isEmpty() && !segmentedNSLink.isEmpty()) {
146 if (!basicNSLink.substring(0, basicNSLink.lastIndexOf(
".")).equals(segmentedNSLink.substring(0, segmentedNSLink.lastIndexOf(
"_")))) {
147 System.out.println(
"Basic NSLink and segmented NSLink do not match " 148 +
"for communication " + communicationName +
", id: " + communicationID +
".");
149 stats.
addCritical(
function, cd,
"NSLink filename mismatch exb: " + basicNSLink.substring(0, basicNSLink.lastIndexOf(
".")) +
" exs: " + segmentedNSLink.substring(0, segmentedNSLink.lastIndexOf(
"_"))
150 +
" for communication " + communicationName +
".");
152 stats.
addCorrect(
function, cd,
"NSLink filename matches exb: " + basicNSLink.substring(0, basicNSLink.lastIndexOf(
".")) +
" exs: " + segmentedNSLink.substring(0, segmentedNSLink.lastIndexOf(
"_"))
153 +
" for communication " + communicationName +
".");
158 System.out.println(
"No transcriptions found " 159 +
"for communication " + communicationName +
", id: " + communicationID +
".");
160 stats.
addCorrect(
function, cd,
"No transcript found to be compared " 161 +
"for communication " + communicationName +
".");
175 Class cl = Class.forName(
"de.uni_hamburg.corpora.ComaData");
177 }
catch (ClassNotFoundException ex) {
178 report.
addException(ex,
"unknown class not found error");
189 String description =
"This class checks whether or not there is a mismatch " 190 +
"between basic and segmented names, basic and segmented file names, " 191 +
"plus their NSLinks for each communication in the coma file.";
196 public Report function(
Corpus c, Boolean fix)
throws SAXException, IOException, ParserConfigurationException, URISyntaxException, TransformerException, XPathExpressionException {
198 cd = c.getComaData();
199 stats =
function(cd, fix);
Collection< Class<?extends CorpusData > > getIsUsableFor()
static org.jdom.Document W3cDocument2JdomDocument(org.w3c.dom.Document input)
void addCritical(String description)
ComaTranscriptionsNameChecker()
void setJdom(Document jdom)
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
void addCorrect(String statId, String description)
static String W3cDocument2String(org.w3c.dom.Document doc)
static String JdomDocument2String(org.jdom.Document jdomDocument)
void addException(Throwable e, String description)
void write(CorpusData cd, URL url)
void addFix(String statId, CorpusData cd, String description)