corpus-services  1.0
ExbPatternChecker.java
Go to the documentation of this file.
1 
11 package de.uni_hamburg.corpora.validation;
12 
14 import java.io.IOException;
15 import java.io.File;
16 import java.util.List;
17 import java.util.ArrayList;
18 import java.util.regex.Pattern;
19 import java.util.regex.Matcher;
20 import javax.xml.parsers.DocumentBuilder;
21 import javax.xml.parsers.DocumentBuilderFactory;
22 import javax.xml.parsers.ParserConfigurationException;
23 
24 import org.apache.commons.cli.Option;
25 import org.apache.commons.cli.CommandLine;
26 import org.xml.sax.SAXException;
27 import org.w3c.dom.Document;
28 import org.w3c.dom.Element;
29 import org.w3c.dom.Node;
30 import org.w3c.dom.NodeList;
31 import org.w3c.dom.Text;
32 
33 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
34 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
35 
39 public class ExbPatternChecker {
40 
41  BasicTranscription bt;
42  ValidatorSettings settings;
43  List<String> conventions = new ArrayList<String>();
44  List<String> problems = new ArrayList<String>();
45 
46  final String function = "exb-patterns";
47 
48  private void tryLoadBasicTranscription(String filename)
49  throws SAXException, JexmaraldaException {
50  if (bt == null) {
51  bt = new BasicTranscription(filename);
52  }
53  }
54 
55  public Report check(File f) {
56  Report stats = new Report();
57  try {
58  stats = exceptionalCheck(f);
59  } catch (ParserConfigurationException pce) {
60  stats.addException(pce, "Unknown parser error");
61  } catch (SAXException saxe) {
62  stats.addException(saxe, "Unknown parser error");
63  } catch (IOException ioe) {
64  stats.addException(ioe, "Unknown read error");
65  }
66  return stats;
67  }
68 
69  public Report exceptionalCheck(File f)
70  throws SAXException, IOException, ParserConfigurationException {
71  // XXX: get conventions from settings somehow
72  List<Pattern> correctPatterns = new ArrayList<Pattern>();
73  for (String convention : conventions) {
74  correctPatterns.add(Pattern.compile(convention));
75  }
76  List<Pattern> errorPatterns = new ArrayList<Pattern>();
77  for (String problem : problems) {
78  errorPatterns.add(Pattern.compile(problem));
79  }
80  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
81  DocumentBuilder db = dbf.newDocumentBuilder();
82  Document doc = db.parse(f);
83  NodeList events = doc.getElementsByTagName("event");
84  Report stats = new Report();
85  for (int i = 0; i < events.getLength(); i++) {
86  Element event = (Element)events.item(i);
87  NodeList eventTexts = event.getChildNodes();
88  for (int j = 0; j < eventTexts.getLength(); j++) {
89  Node maybeText = eventTexts.item(j);
90  if (maybeText.getNodeType() != Node.TEXT_NODE) {
91  if (maybeText.getNodeType() == Node.ELEMENT_NODE &&
92  maybeText.getNodeName().equals("ud-information")) {
93  // XXX: ud-information is weird I'll just skip it...
94  continue;
95  }
96  System.err.println("This is not a text node: " +
97  maybeText);
98  continue;
99  }
100  Text eventText = (Text) maybeText;
101  String text = eventText.getWholeText();
102  int k = 0;
103  for (Pattern pattern : correctPatterns) {
104  Matcher matcher = pattern.matcher(text);
105  if (!matcher.matches()) {
106  stats.addCritical(function,
107  "Text: " + text + " does not fit to the " +
108  "conventions given.", "Expression was: " +
109  conventions.get(k));
110  }
111  }
112  k = 0;
113  for (Pattern pattern : errorPatterns) {
114  Matcher matcher = pattern.matcher(text);
115  if (matcher.matches()) {
116  stats.addCritical(function,
117  "Text: " + text + " does not fit to the " +
118  "conventions given.", "Expression was: " +
119  errorPatterns.get(k));
120  }
121  }
122  }
123  }
124  return stats;
125  }
126 
127  public Report doMain(String[] args) {
128  settings = new ValidatorSettings("ExbPatternChecker",
129  "Checks Exmaralda .exb file annotations for conventions using " +
130  "patterns", "If input is a directory, performs recursive check "
131  + "from that directory, otherwise checks input file\n" +
132  "Patterns are given as regular expressions to match against " +
133  "(regular expression is compiled with java.util.regex)");
134  // XXX: the option version is quite useless unless for quick checks
135  List<Option> patternOptions = new ArrayList<Option>();
136  patternOptions.add(new Option("a", "accept", true, "add an acceptable "
137  + "pattern"));
138  patternOptions.add(new Option("d", "disallow", true, "add an illegal "
139  + "pattern"));
140  CommandLine cmd = settings.handleCommandLine(args, patternOptions);
141  if (cmd == null) {
142  System.exit(0);
143  }
144  if (!cmd.hasOption("accept") && !cmd.hasOption("disallow")) {
145  System.err.println("Nothing to accept or disallow, " +
146  "skipping checks.");
147  System.exit(0);
148  }
149  if (cmd.hasOption("accept")) {
150  conventions.add(cmd.getOptionValue("accept"));
151  }
152  if (cmd.hasOption("disallow")) {
153  problems.add(cmd.getOptionValue("disallow"));
154  }
155  if (settings.isVerbose()) {
156  System.out.println("Checking exb files for unconventional " +
157  "annotations...");
158  }
159  Report stats = new Report();
160  for (File f : settings.getInputFiles()) {
161  if (settings.isVerbose()) {
162  System.out.println(" * " + f.getName());
163  }
164  stats = check(f);
165  }
166  return stats;
167  }
168 
169  public static void main(String[] args) {
170  ExbPatternChecker checker = new ExbPatternChecker();
171  Report stats = checker.doMain(args);
172  System.out.println(stats.getSummaryLines());
173  System.out.println(stats.getErrorReports());
174  }
175 }
CommandLine handleCommandLine(String[] args, List< Option > extraOptions)
void addCritical(String description)
Definition: Report.java:104
void addException(Throwable e, String description)
Definition: Report.java:287