10 package de.uni_hamburg.corpora.validation;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
16 import org.xml.sax.SAXException;
17 import org.xml.sax.SAXParseException;
18 import org.xml.sax.ErrorHandler;
// Name of the file being handled; it is not read or written anywhere in the visible excerpt.
// NOTE(review): presumably set by the caller before validation starts -- confirm.
29 private String currentFileName;
// Fixed identifier string "coma-validate-xsd".
// NOTE(review): presumably the statId passed to addNote/addWarning for XSD validation
// findings -- the calls themselves are not visible in this excerpt, confirm.
31 final String COMA_XSD =
"coma-validate-xsd";
/**
 * Maps the message of a SAX parse exception to a human-readable description
 * plus a suggested remedy.
 *
 * The method first prepares six regex matchers over the message, each capturing
 * the quoted name that follows one of the keywords "IDREF", "ID value",
 * "element", "attribute", "value" and "value for". It then branches on the
 * XML Schema error code contained in the message (cvc-complex-type.4,
 * cvc-id.1, cvc-id.2, ...) and picks the texts for that case, falling back to
 * generic texts when the expected names cannot be extracted.
 *
 * NOTE(review): the statements that consume the string pairs below are not
 * visible in this excerpt; presumably they are handed to the report's
 * addNote/addWarning/addCritical methods -- confirm against the full file.
 *
 * @param saxpe the parse exception whose message is analysed
 */
48 private void storeException(SAXParseException saxpe) {
50 String msg = saxpe.getMessage();
// Each pattern captures (group 1) the text between single or double quotes
// that directly follows the keyword.
51 String idrefPattern =
"IDREF [\"']([^\"']*)[\"']";
52 Pattern idrefRE = Pattern.compile(idrefPattern);
53 Matcher idrefm = idrefRE.matcher(msg);
54 String idvaluePattern =
"ID value [\"']([^\"']*)[\"']";
55 Pattern idvalueRE = Pattern.compile(idvaluePattern);
56 Matcher idvaluem = idvalueRE.matcher(msg);
57 String elementPattern =
"[Ee]lement [\"']([^\"']*)[\"']";
58 Pattern elementRE = Pattern.compile(elementPattern);
59 Matcher elementm = elementRE.matcher(msg);
60 String attributePattern =
"[Aa]ttribute [\"']([^\"']*)[\"']";
61 Pattern attributeRE = Pattern.compile(attributePattern);
62 Matcher attributem = attributeRE.matcher(msg);
63 String valuePattern =
"[Vv]alue [\"']([^\"']*)[\"']";
64 Pattern valueRE = Pattern.compile(valuePattern);
65 Matcher valuem = valueRE.matcher(msg);
66 String value4Pattern =
"[Vv]alue for [\"']([^\"']*)[\"']";
67 Pattern value4RE = Pattern.compile(value4Pattern);
68 Matcher value4m = value4RE.matcher(msg);
// cvc-complex-type.4: an attribute is missing from an element.
70 if (msg.contains(
"cvc-complex-type.4")) {
71 if ((attributem.find() && elementm.find())) {
72 if (attributem.group(1).equals(
"schemaVersion")) {
74 "schemaVersion is missing from coma file",
75 "This can be safely ignored.");
76 }
else if ((attributem.group(1).equals(
"Id")) &&
77 (elementm.group(1).equals(
"Recording"))) {
79 "This recording is missing an ID.",
80 "Change Name into ID or create a new ID.");
81 }
else if ((attributem.group(1).equals(
"Id")) &&
82 (elementm.group(1).equals(
"Media"))) {
84 "This media is missing an ID.",
86 }
else if ((attributem.group(1).equals(
"Id")) &&
87 (elementm.group(1).equals(
"File"))) {
89 "This file is missing an ID.",
93 "This " + elementm.group(1) +
94 " is missing " + attributem.group(1) +
".");
96 }
else if (elementm.find()) {
98 "Unknown error with missing attribute in " +
102 "Unknown error with missing attribute");
104 }
// cvc-id.1: an ID reference that has no matching ID.
else if (msg.contains(
"cvc-id.1")) {
105 if (idrefm.find() && elementm.find()) {
107 "The " + elementm.group(1) +
" with ID code " +
108 idrefm.group(1) +
" is missing.",
109 "Add a new <" + elementm.group(1) +
" Id='" 110 + idrefm.group(1) +
"'>.");
111 }
else if (elementm.find()) {
113 "An " + elementm.group(1) +
" is missing.",
114 "Add a new <" + elementm.group(1) +
" ...'>.");
115 }
// NOTE(review): when the first idrefm.find() above succeeded but
// elementm.find() failed, this second find() searches for a *further*
// IDREF match and will normally fail, so the ID-only case appears to
// fall through to the generic texts -- confirm and restructure once
// the full method is in view.
else if (idrefm.find()) {
117 "A speaker, transcription or some file with ID " +
118 idrefm.group(1) +
" is missing.",
119 "Add a new element with Id=" + idrefm.group(1) +
123 "There is an unmatched ID here. Could be a speaker, " +
124 "transcription, or some other file.",
125 "Need to add something that would match the ID.");
127 }
// cvc-id.2: the same ID is defined more than once.
else if (msg.contains(
"cvc-id.2")) {
131 idrefm.group(1) +
" is defined in two or more places.",
132 "Remove all but one, or change ID's.");
133 }
else if (idvaluem.find()) {
136 idvaluem.group(1) +
" is defined in two or more places.",
137 "Remove all but one, or change ID's.");
140 "There's a duplicate ID here. ",
141 "Check and remove duplicates, or change the ID's.");
143 }
// cvc-type.3.1.3: an element has an invalid value.
else if (msg.contains(
"cvc-type.3.1.3")) {
144 if (valuem.find() && elementm.find()) {
// Compare the captured value (not the Matcher object itself) with the
// empty string; valuem.find() has just succeeded, so group(1) is valid.
145 if (valuem.group(1).equals(
"")) {
147 elementm.group(1) +
" cannot be empty or undefined.",
148 "Fill in some value.");
149 }
else if (elementm.group(1).equals(
"PeriodStart")) {
151 "Period start is formatted wrong.",
152 "Reformat in YYYY-MM-DD format.");
153 }
else if (elementm.group(1).equals(
"Person")) {
155 "Person ID is formatted wrong.",
156 "Rewrite ID to contain only A-Z, 0-9, and hyphens.");
159 elementm.group(1) +
" has invalid value " +
160 valuem.group(1) +
".");
164 "Invalid value somewhere");
166 }
// cvc-datatype-valid.1.2.1: a value is not valid for its datatype.
else if (msg.contains(
"cvc-datatype-valid.1.2.1")) {
167 if (valuem.find() && value4m.find()) {
168 if (valuem.group(1).equals(
"")) {
170 value4m.group(1) +
" cannot be empty or undefined. ",
171 "It needs to be filled in.");
172 }
else if (msg.contains(
"'' is not a valid value")) {
174 value4m.group(1) +
" is empty when it shouldn't",
175 "It needs to be filled in");
176 }
else if (value4m.group(1).equals(
"NCName")) {
178 "Name ID is not correct. ",
179 "It should be an ID of letters, dashes and numbers");
180 }
else if (value4m.group(1).equals(
"dateTime")) {
182 "Date here is formatted wrong.",
183 "Write it as YYYY-MM-DD.");
186 value4m.group(1) +
" is not here",
187 "See the original error message.");
189 }
else if (value4m.find()) {
190 if (msg.contains(
"'' is not a valid value")) {
192 value4m.group(1) +
" is empty when it shouldn't",
193 "It needs to be filled in");
196 "Invalid value for " + value4m.group(1));
200 "Invalid value somewhere??");
202 }
// cvc-complex-type.2.3: text found where none is allowed.
else if (msg.contains(
"cvc-complex-type.2.3")) {
204 "There's text in the file where there should be none. ",
205 "if you edited file in text editor, " +
206 "check for odd characters including special space marks," 207 +
"remove # signs outside the content and use <!-- --> " 210 }
// cvc-complex-type.2.4.a / 2.4.d: an element appears in the wrong place.
else if ((msg.contains(
"cvc-complex-type.2.4.a")) ||
211 (msg.contains(
"cvc-complex-type.2.4.d"))) {
212 if (elementm.find()) {
213 if (elementm.group(1).equals(
"Description")) {
215 "Description elements are in wrong order here.",
216 "This can be ignored.");
217 }
else if (elementm.group(1).equals(
"DBNode")) {
219 "DBNode elements are in wrong order here.",
220 "This can be ignored");
221 }
else if (elementm.group(1).equals(
"RecordingDuration")) {
223 "Recording duration cannot be figured out properly.",
224 "Are there more than one conflicting durations?");
225 }
else if (elementm.group(1).equals(
"role")) {
227 "Roles have been deprecated.",
228 "You can remove it from the file, it is not used." 231 }
else if (elementm.group(1).equals(
"AsocFile")) {
233 "AsocFile is not valid element type here.",
234 "AsocFile should be renamed to AssociatedFile?" 236 }
else if (elementm.group(1).equals(
"NSLink")) {
238 "NSLink is not valid element type here.",
239 "<NSLink should be replaced with a <File... " 241 }
else if (elementm.group(1).equals(
"Filename")) {
243 "Filename is somewhat out of order.",
244 "Probably doesn't matter if file was found.");
245 }
else if (elementm.group(1).equals(
"Name")) {
247 "Name value cannot be associated to correct entity.",
248 " It may be copy/pasted to a wrong place?");
249 }
else if (elementm.group(1).equals(
"Key")) {
251 "Key value cannot be associated to correct entity.",
252 " It may be copy/pasted to a wrong place?");
253 }
else if (elementm.group(1).equals(
"Postalcode")) {
255 "Postalcode comes in wrong order here",
256 "This can be safely ignored.");
257 }
else if (elementm.group(1).equals(
"Media")) {
259 "Media comes in wrong order here",
260 "This probably doesn't matter");
263 elementm.group(1) +
" is in wrong place in the file.",
264 "This probably doesn't matter but might get misplaced");
268 "Unexpected element error stuff",
269 "See the original error message.");
271 }
// cvc-complex-type.2.4.b: an element is missing some of its content.
else if (msg.contains(
"cvc-complex-type.2.4.b")) {
272 if (elementm.find()) {
273 if (elementm.group(1).equals(
"Language")) {
275 "Language elements are missing or partial here.",
276 "You should add or replace <Languages");
277 }
else if (elementm.group(1).equals(
"File")) {
279 "File information is missing or partial here.",
280 "You should add absolute path, URL or so.");
283 elementm.group(1) +
" is missing some of needed data",
284 "See the original error message.");
288 "Unexpected element error stuff");
290 }
// cvc-complex-type.3.2.2: an attribute is not expected on the element.
else if (msg.contains(
"cvc-complex-type.3.2.2")) {
291 if (attributem.find() && elementm.find()) {
292 if ((attributem.group(1).equals(
"Name")) &&
293 (elementm.group(1).equals(
"Recording"))) {
295 "There's an extra Name in this Recording.",
296 "This name should probably be id... " +
297 "see for other warnings.");
298 }
else if ((attributem.group(1).equals(
"type")) &&
299 (elementm.group(1).equals(
"Location"))) {
301 "There's an extraneous type= for this Location.",
302 "Location types may be ignored, when in attribute only." 304 }
else if ((attributem.group(1).equals(
"type")) &&
305 (elementm.group(1).equals(
"Language"))) {
307 "There's an extraneous type= for this Language.",
308 "Language types may be ignored, when in attribute only." 310 }
else if ((attributem.group(1).equals(
"xsi")) ||
311 (attributem.group(1).equals(
"noNamespaceSchemaLocation"))) {
313 "The namespaces are missing from XML! ",
314 "This file may be really broken...");
317 "Unexpected attribute " + attributem.group(1) +
318 " in " + elementm.group(1));
322 "Unexpected attribute error stuff");
324 }
// cvc-attribute.3: an attribute value is invalid.
else if (msg.contains(
"cvc-attribute.3")) {
325 if (attributem.find() && valuem.find()) {
326 if (attributem.group(1).equals(
"Id")) {
328 "This ID " + valuem.group(1) +
329 " doesn't validate (XML-wise)",
330 "May be still used if matches neatly for HZSK uses.");
333 "This value " + valuem.group(1) +
334 " is invalid for " + attributem.group(1),
335 "May need manual corrections.");
339 " Attribute value error");
// Fallback text for messages carrying none of the error codes above.
343 "Totally unrecognised validation error.");
/**
 * Handles a fatal parse error by passing it on to storeException.
 *
 * NOTE(review): presumably the org.xml.sax.ErrorHandler callback of the same
 * name -- confirm that the enclosing class implements ErrorHandler.
 *
 * @param saxpe the parse exception to record
 * @throws SAXException declared by the signature; the visible statement does
 *         not throw it itself
 */
351 public void fatalError(SAXParseException saxpe)
throws SAXException {
352 storeException(saxpe);
/**
 * Handles a parse error by passing it on to storeException.
 *
 * NOTE(review): presumably the org.xml.sax.ErrorHandler callback of the same
 * name -- confirm that the enclosing class implements ErrorHandler.
 *
 * @param saxpe the parse exception to record
 * @throws SAXException declared by the signature; the visible statement does
 *         not throw it itself
 */
359 public void error(SAXParseException saxpe)
throws SAXException {
360 storeException(saxpe);
/**
 * Handles a parse warning by passing it on to storeException, exactly like
 * the error and fatalError handlers do.
 *
 * NOTE(review): presumably the org.xml.sax.ErrorHandler callback of the same
 * name -- confirm that the enclosing class implements ErrorHandler.
 *
 * @param saxpe the parse exception to record
 * @throws SAXException declared by the signature; the visible statement does
 *         not throw it itself
 */
367 public void warning(SAXParseException saxpe)
throws SAXException {
368 storeException(saxpe);
void addNote(String statId, String description)
void warning(SAXParseException saxpe)
void addCritical(String description)
void addWarning(String statId, String description)
void error(SAXParseException saxpe)
ComaErrorReportGenerator()
void addException(Throwable e, String description)
void fatalError(SAXParseException saxpe)