corpus-services  1.0
ComaErrorReportGenerator.java
Go to the documentation of this file.
1 
10 package de.uni_hamburg.corpora.validation;
11 
12 
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
16 import org.xml.sax.SAXException;
17 import org.xml.sax.SAXParseException;
18 import org.xml.sax.ErrorHandler;
19 
26 public class ComaErrorReportGenerator implements ErrorHandler {
27 
28  // store latest file name for laughs
29  private String currentFileName;
30  private Report stats;
31  final String COMA_XSD = "coma-validate-xsd";
32 
37  super();
38  stats = new Report();
39  }
40 
41  public Report getErrors() {
42  return stats;
43  }
44 
48  private void storeException(SAXParseException saxpe) {
49  // yeah this hack relies on parsing the localised(?) messages...
50  String msg = saxpe.getMessage();
51  String idrefPattern = "IDREF [\"']([^\"']*)[\"']";
52  Pattern idrefRE = Pattern.compile(idrefPattern);
53  Matcher idrefm = idrefRE.matcher(msg);
54  String idvaluePattern = "ID value [\"']([^\"']*)[\"']";
55  Pattern idvalueRE = Pattern.compile(idvaluePattern);
56  Matcher idvaluem = idvalueRE.matcher(msg);
57  String elementPattern = "[Ee]lement [\"']([^\"']*)[\"']";
58  Pattern elementRE = Pattern.compile(elementPattern);
59  Matcher elementm = elementRE.matcher(msg);
60  String attributePattern = "[Aa]ttribute [\"']([^\"']*)[\"']";
61  Pattern attributeRE = Pattern.compile(attributePattern);
62  Matcher attributem = attributeRE.matcher(msg);
63  String valuePattern = "[Vv]alue [\"']([^\"']*)[\"']";
64  Pattern valueRE = Pattern.compile(valuePattern);
65  Matcher valuem = valueRE.matcher(msg);
66  String value4Pattern = "[Vv]alue for [\"']([^\"']*)[\"']";
67  Pattern value4RE = Pattern.compile(value4Pattern);
68  Matcher value4m = value4RE.matcher(msg);
69  // use error id as first narrow down:
70  if (msg.contains("cvc-complex-type.4")) {
71  if ((attributem.find() && elementm.find())) {
72  if (attributem.group(1).equals("schemaVersion")) {
73  stats.addNote(COMA_XSD, saxpe,
74  "schemaVersion is missing from coma file",
75  "This can be safely ignored.");
76  } else if ((attributem.group(1).equals("Id")) &&
77  (elementm.group(1).equals("Recording"))) {
78  stats.addCritical(COMA_XSD, saxpe,
79  "This recording is missing an ID.",
80  "Change Name into ID or create a new ID.");
81  } else if ((attributem.group(1).equals("Id")) &&
82  (elementm.group(1).equals("Media"))) {
83  stats.addCritical(COMA_XSD, saxpe,
84  "This media is missing an ID.",
85  "Create a new ID.");
86  } else if ((attributem.group(1).equals("Id")) &&
87  (elementm.group(1).equals("File"))) {
88  stats.addCritical(COMA_XSD, saxpe,
89  "This file is missing an ID.",
90  "Create a new ID.");
91  } else {
92  stats.addException(COMA_XSD, saxpe,
93  "This " + elementm.group(1) +
94  " is missing " + attributem.group(1) + ".");
95  }
96  } else if (elementm.find()) {
97  stats.addException(COMA_XSD, saxpe,
98  "Unknown error with missing attribute in " +
99  elementm.group(1));
100  } else {
101  stats.addException(COMA_XSD, saxpe,
102  "Unknown error with missing attribute");
103  }
104  } else if (msg.contains("cvc-id.1")) {
105  if (idrefm.find() && elementm.find()) {
106  stats.addCritical(COMA_XSD, saxpe,
107  "The " + elementm.group(1) + " with ID code " +
108  idrefm.group(1) + " is missing.",
109  "Add a new <" + elementm.group(1) + " Id="
110  + idrefm.group(1) + "'>.");
111  } else if (elementm.find()) {
112  stats.addCritical(COMA_XSD, saxpe,
113  "An " + elementm.group(1) + " is missing.",
114  "Add a new <" + elementm.group(1) + " ...'>.");
115  } else if (idrefm.find()) {
116  stats.addCritical(COMA_XSD, saxpe,
117  "A speaker, transcription or some file with ID " +
118  idrefm.group(1) + " is missing.",
119  "Add a new element with Id=" + idrefm.group(1) +
120  "'>.");
121  } else {
122  stats.addException(COMA_XSD, saxpe,
123  "There is an unmatched ID here. Could be a speaker, " +
124  "transcription, or some other file.",
125  "Need to add something that would match the ID.");
126  }
127  } else if (msg.contains("cvc-id.2")) {
128  if (idrefm.find()) {
129  stats.addCritical(COMA_XSD, saxpe,
130  "The Id code " +
131  idrefm.group(1) + " is defined in two or more places.",
132  "Remove all but one, or change ID's.");
133  } else if (idvaluem.find()) {
134  stats.addCritical(COMA_XSD, saxpe,
135  "The Id code " +
136  idvaluem.group(1) + " is defined in two or more places.",
137  "Remove all but one, or change ID's.");
138  } else {
139  stats.addException(COMA_XSD, saxpe,
140  "There's a duplicate ID here. ",
141  "Check and remove duplicates, or change the ID's.");
142  }
143  } else if (msg.contains("cvc-type.3.1.3")) {
144  if (valuem.find() && elementm.find()) {
145  if (valuem.equals("")) {
146  stats.addCritical(COMA_XSD, saxpe,
147  elementm.group(1) + " cannot be empty or undefined.",
148  "Fill in some value.");
149  } else if (elementm.group(1).equals("PeriodStart")) {
150  stats.addCritical(COMA_XSD, saxpe,
151  "Period start is formatted wrong.",
152  "Reformat in YYYY-MM-DD format.");
153  } else if (elementm.group(1).equals("Person")) {
154  stats.addCritical(COMA_XSD, saxpe,
155  "Person ID is formatted wrong.",
156  "Rewrite ID to contain only A-Z, 0-9, and hyphens.");
157  } else {
158  stats.addException(COMA_XSD, saxpe,
159  elementm.group(1) + " has invalid value " +
160  valuem.group(1) + ".");
161  }
162  } else {
163  stats.addException(COMA_XSD, saxpe,
164  "Invalid value somewhere");
165  }
166  } else if (msg.contains("cvc-datatype-valid.1.2.1")) {
167  if (valuem.find() && value4m.find()) {
168  if (valuem.group(1).equals("")) {
169  stats.addCritical(COMA_XSD, saxpe,
170  value4m.group(1) + " cannot be empty or undefined. ",
171  "It needs to be filled in.");
172  } else if (msg.contains("'' is not a valid value")) {
173  stats.addCritical(COMA_XSD, saxpe,
174  value4m.group(1) + " is empty when it shouldn't",
175  "It needs to be filled in");
176  } else if (value4m.group(1).equals("NCName")) {
177  stats.addCritical(COMA_XSD, saxpe,
178  "Name ID is not correct. ",
179  "It should be an ID of letters, dashes and numbers");
180  } else if (value4m.group(1).equals("dateTime")) {
181  stats.addCritical(COMA_XSD, saxpe,
182  "Date here is formatted wrong.",
183  "Write it as YYYY-MM-DD.");
184  } else {
185  stats.addException(COMA_XSD, saxpe,
186  value4m.group(1) + " is not here",
187  "See the original error message.");
188  }
189  } else if (value4m.find()) {
190  if (msg.contains("'' is not a valid value")) {
191  stats.addCritical(COMA_XSD, saxpe,
192  value4m.group(1) + " is empty when it shouldn't",
193  "It needs to be filled in");
194  } else {
195  stats.addException(COMA_XSD, saxpe,
196  "Invalid value for " + value4m.group(1));
197  }
198  } else {
199  stats.addException(COMA_XSD, saxpe,
200  "Invalid value somewhere??");
201  }
202  } else if (msg.contains("cvc-complex-type.2.3")) {
203  stats.addCritical(COMA_XSD, saxpe,
204  "There's text in the file where there should be none. ",
205  "if you edited file in text editor, " +
206  "check for odd characters including special space marks,"
207  + "remove # signs outside the content and use <!-- --> "
208  + " comments."
209  );
210  } else if ((msg.contains("cvc-complex-type.2.4.a")) ||
211  (msg.contains("cvc-complex-type.2.4.d"))) {
212  if (elementm.find()) {
213  if (elementm.group(1).equals("Description")) {
214  stats.addNote(COMA_XSD, saxpe,
215  "Description elements are in wrong order here.",
216  "This can be ignored.");
217  } else if (elementm.group(1).equals("DBNode")) {
218  stats.addNote(COMA_XSD, saxpe,
219  "DBNode elements are in wrong order here.",
220  "This can be ignored");
221  } else if (elementm.group(1).equals("RecordingDuration")) {
222  stats.addCritical(COMA_XSD, saxpe,
223  "Recording duration cannot be figured out properly.",
224  "Are there more than one conflicting durations?");
225  } else if (elementm.group(1).equals("role")) {
226  stats.addNote(COMA_XSD, saxpe,
227  "Roles have been deprecated.",
228  "You can remove it from the file, it is not used."
229  );
230  // FIXME: the File / Filename / NSLink /URI stuff...
231  } else if (elementm.group(1).equals("AsocFile")) {
232  stats.addWarning(COMA_XSD, saxpe,
233  "AsocFile is not valid element type here.",
234  "AsocFile should be renamed to AssociatedFile?"
235  );
236  } else if (elementm.group(1).equals("NSLink")) {
237  stats.addWarning(COMA_XSD, saxpe,
238  "NSLink is not valid element type here.",
239  "<NSLink should be replaced with a <File... "
240  );
241  } else if (elementm.group(1).equals("Filename")) {
242  stats.addNote(COMA_XSD, saxpe,
243  "Filename is somewhat out of order.",
244  "Probably doesn't matter if file was found.");
245  } else if (elementm.group(1).equals("Name")) {
246  stats.addCritical(COMA_XSD, saxpe,
247  "Name value cannot be associated to correct entity.",
248  " It may be copy/pasted to a wrong place?");
249  } else if (elementm.group(1).equals("Key")) {
250  stats.addCritical(COMA_XSD, saxpe,
251  "Key value cannot be associated to correct entity.",
252  " It may be copy/pasted to a wrong place?");
253  } else if (elementm.group(1).equals("Postalcode")) {
254  stats.addNote(COMA_XSD, saxpe,
255  "Postalcode comes in wrong order here",
256  "This can be safely ignored.");
257  } else if (elementm.group(1).equals("Media")) {
258  stats.addNote(COMA_XSD, saxpe,
259  "Media comes in wrong order here",
260  "This probably doesn't matter");
261  } else {
262  stats.addException(COMA_XSD, saxpe,
263  elementm.group(1) + " is in wrong place in the file.",
264  "This probably doesn't matter but might get misplaced");
265  }
266  } else {
267  stats.addException(COMA_XSD, saxpe,
268  "Unexpected element error stuff",
269  "See the original error message.");
270  }
271  } else if (msg.contains("cvc-complex-type.2.4.b")) {
272  if (elementm.find()) {
273  if (elementm.group(1).equals("Language")) {
274  stats.addCritical(COMA_XSD, saxpe,
275  "Language elements are missing or partial here.",
276  "You should add or replace <Languages");
277  } else if (elementm.group(1).equals("File")) {
278  stats.addNote(COMA_XSD, saxpe,
279  "File information is missing or partial here.",
280  "You should add absolute path, URL or so.");
281  } else {
282  stats.addException(COMA_XSD, saxpe,
283  elementm.group(1) + " is missing some of needed data",
284  "See the original error message.");
285  }
286  } else {
287  stats.addException(COMA_XSD, saxpe,
288  "Unexpected element error stuff");
289  }
290  } else if (msg.contains("cvc-complex-type.3.2.2")) {
291  if (attributem.find() && elementm.find()) {
292  if ((attributem.group(1).equals("Name")) &&
293  (elementm.group(1).equals("Recording"))) {
294  stats.addWarning(COMA_XSD, saxpe,
295  "There's an extra Name in this Recording.",
296  "This name should probably be id... " +
297  "see for other warnings.");
298  } else if ((attributem.group(1).equals("type")) &&
299  (elementm.group(1).equals("Location"))) {
300  stats.addWarning(COMA_XSD, saxpe,
301  "There's an extraneous type= for this Location.",
302  "Location types may be ignored, when in attribute only."
303  );
304  } else if ((attributem.group(1).equals("type")) &&
305  (elementm.group(1).equals("Language"))) {
306  stats.addWarning(COMA_XSD, saxpe,
307  "There's an extraneous type= for this Language.",
308  "Language types may be ignored, when in attribute only."
309  );
310  } else if ((attributem.group(1).equals("xsi")) ||
311  (attributem.group(1).equals("noNamespaceSchemaLocation"))) {
312  stats.addCritical(COMA_XSD, saxpe,
313  "The namespaces are missing from XML! ",
314  "This file may be really broken...");
315  } else {
316  stats.addException(COMA_XSD, saxpe,
317  "Unexpected attribute " + attributem.group(1) +
318  " in " + elementm.group(1));
319  }
320  } else {
321  stats.addException(COMA_XSD, saxpe,
322  "Unexpected attribute error stuff");
323  }
324  } else if (msg.contains("cvc-attribute.3")) {
325  if (attributem.find() && valuem.find()) {
326  if (attributem.group(1).equals("Id")) {
327  stats.addCritical(COMA_XSD, saxpe,
328  "This ID " + valuem.group(1) +
329  " doesn't validate (XML-wise)",
330  "May be still used if matches neatly for HZSK uses.");
331  } else {
332  stats.addException(COMA_XSD, saxpe,
333  "This value " + valuem.group(1) +
334  " is invalid for " + attributem.group(1),
335  "May need manual corrections.");
336  }
337  } else {
338  stats.addException(COMA_XSD, saxpe,
339  " Attribute value error");
340  }
341  } else {
342  stats.addException(COMA_XSD, saxpe,
343  "Totally unrecognised validation error.");
344  }
345  }
346 
351  public void fatalError(SAXParseException saxpe) throws SAXException {
352  storeException(saxpe);
353  }
354 
359  public void error(SAXParseException saxpe) throws SAXException {
360  storeException(saxpe);
361  }
362 
367  public void warning(SAXParseException saxpe) throws SAXException {
368  storeException(saxpe);
369 
370  }
371 }
372 
void addNote(String statId, String description)
Definition: Report.java:245
void addCritical(String description)
Definition: Report.java:104
void addWarning(String statId, String description)
Definition: Report.java:164
void addException(Throwable e, String description)
Definition: Report.java:287