hzsk-corpus-services  0.2
FilenameChecker.java
Go to the documentation of this file.
1 
9 package de.uni_hamburg.corpora.validation;
10 
16 import java.io.File;
17 import java.io.FileOutputStream;
18 import java.io.IOException;
19 import java.net.URISyntaxException;
20 import java.net.URL;
21 import java.nio.file.Files;
22 import java.nio.file.Paths;
23 import java.util.ArrayList;
24 import java.util.Collection;
25 import java.util.HashMap;
26 import java.util.HashSet;
27 import java.util.Hashtable;
28 import java.util.Set;
29 import java.util.List;
30 import java.util.logging.Level;
31 import java.util.logging.Logger;
32 import java.util.Map;
33 import java.util.regex.Matcher;
34 import java.util.regex.Pattern;
35 import javax.xml.parsers.DocumentBuilder;
36 import javax.xml.parsers.DocumentBuilderFactory;
37 import javax.xml.parsers.ParserConfigurationException;
39 import org.apache.commons.cli.Option;
40 import org.apache.commons.cli.CommandLine;
41 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
42 import org.jdom.JDOMException;
43 import org.w3c.dom.Document;
44 import org.xml.sax.SAXException;
45 
50 public class FilenameChecker extends Checker implements CorpusFunction {
51 
52  Pattern acceptable;
53  Pattern unacceptable;
54  ValidatorSettings settings;
55 
56  final String FILENAME_CONVENTIONS = "filename-conventions";
57  String fileLoc = "";
58 
64  public Report oldCheck(File rootdir) {
65  Report stats = new Report();
66  try {
67  stats = oldExceptionalCheck(rootdir);
68  } catch (IOException ioe) {
69  stats.addException(ioe, "Unknown reading error");
70  }
71  return stats;
72  }
73 
74  private Report oldExceptionalCheck(File rootdir)
75  throws IOException {
76  Report stats = new Report();
77  return oldRecursiveCheck(rootdir, stats);
78  }
79 
80  private Report oldRecursiveCheck(File f,
81  Report stats) throws IOException {
82  String filename = f.getName();
83  Matcher matchAccepting = acceptable.matcher(filename);
84  boolean allesGut = true;
85  if (!matchAccepting.matches()) {
86  stats.addWarning(FILENAME_CONVENTIONS,
87  filename + " does not follow "
88  + "filename conventions for HZSK corpora");
89  allesGut = false;
90  }
91  Matcher matchUnaccepting = unacceptable.matcher(filename);
92  if (matchUnaccepting.find()) {
93  stats.addWarning(FILENAME_CONVENTIONS,
94  filename + " contains "
95  + "characters that may break in HZSK repository");
96  allesGut = false;
97  }
98 
99  if (allesGut) {
100  stats.addCorrect(FILENAME_CONVENTIONS,
101  filename + " is OK by HZSK standards.");
102  }
103  if (f.isDirectory()) {
104  File[] files = f.listFiles();
105  for (File g : files) {
106  stats = oldRecursiveCheck(g, stats);
107  }
108  }
109  return stats;
110  }
111 
112  public Report doMain(String[] args) {
113  settings = new ValidatorSettings("FileCoverageChecker",
114  "Checks Exmaralda .coma file against directory, to find "
115  + "undocumented files",
116  "If input is a directory, performs recursive check "
117  + "from that directory, otherwise checks input file");
118  List<Option> patternOptions = new ArrayList<Option>();
119  patternOptions.add(new Option("a", "accept", true, "add an acceptable "
120  + "pattern"));
121  patternOptions.add(new Option("d", "disallow", true, "add an illegal "
122  + "pattern"));
123  CommandLine cmd = settings.handleCommandLine(args, patternOptions);
124  if (cmd == null) {
125  System.exit(0);
126  }
127  if (cmd.hasOption("accept")) {
128  acceptable = Pattern.compile(cmd.getOptionValue("accept"));
129  } else {
130  acceptable = Pattern.compile("^[A-Za-z0-9_.-]*$");
131  }
132  if (cmd.hasOption("disallow")) {
133  unacceptable = Pattern.compile(cmd.getOptionValue("disallow"));
134  } else {
135  unacceptable = Pattern.compile("[ üäöÜÄÖ]");
136  }
137  if (settings.isVerbose()) {
138  System.out.println("Checking coma file against directory...");
139  }
140  Report stats = new Report();
141  for (File f : settings.getInputFiles()) {
142  if (settings.isVerbose()) {
143  System.out.println(" * " + f.getName());
144  }
145  stats = oldCheck(f);
146  }
147  return stats;
148  }
149 
150  public static void main(String[] args) {
151  FilenameChecker checker = new FilenameChecker();
152  Report stats = checker.doMain(args);
153  System.out.println(stats.getSummaryLines());
154  System.out.println(stats.getErrorReports());
155  }
156 
162  @Override
163  public Report check(CorpusData cd) throws SAXException, JexmaraldaException {
164  Report stats = new Report();
165  try {
166  stats = exceptionalCheck(cd);
167  } catch (ParserConfigurationException pce) {
168  stats.addException(pce, fileLoc + ": Unknown parsing error");
169  } catch (SAXException saxe) {
170  stats.addException(saxe, fileLoc + ": Unknown parsing error");
171  } catch (IOException ioe) {
172  stats.addException(ioe, fileLoc + ": Unknown file reading error");
173  } catch (URISyntaxException ex) {
174  stats.addException(ex, fileLoc + ": Unknown file reading error");
175  }
176  return stats;
177  }
178 
185  private Report exceptionalCheck(CorpusData cd)
186  throws SAXException, IOException, ParserConfigurationException, URISyntaxException {
187  File f = new File(cd.getURL().toString());
188  String filename = f.getName();
189  File fp = f.getParentFile().getParentFile();
190  String[] path = new String[1];
191  path[0] = fp.getPath().substring(6);
192  settings = new ValidatorSettings("FileCoverageChecker",
193  "Checks Exmaralda .coma file against directory, to find "
194  + "undocumented files",
195  "If input is a directory, performs recursive check "
196  + "from that directory, otherwise checks input file");
197  List<Option> patternOptions = new ArrayList<Option>();
198  patternOptions.add(new Option("a", "accept", true, "add an acceptable "
199  + "pattern"));
200  patternOptions.add(new Option("d", "disallow", true, "add an illegal "
201  + "pattern"));
202  CommandLine cmd = settings.handleCommandLine(path, patternOptions);
203  if (cmd == null) {
204  System.exit(0);
205  }
206  if (cmd.hasOption("accept")) {
207  acceptable = Pattern.compile(cmd.getOptionValue("accept"));
208  } else {
209  acceptable = Pattern.compile("^[A-Za-z0-9_.-]*$");
210  }
211  if (cmd.hasOption("disallow")) {
212  unacceptable = Pattern.compile(cmd.getOptionValue("disallow"));
213  } else {
214  unacceptable = Pattern.compile("[ üäöÜÄÖ]");
215  }
216  if (settings.isVerbose()) {
217  System.out.println("Checking coma file against directory...");
218  }
219  Report stats = new Report();
220 
221  Matcher matchAccepting = acceptable.matcher(filename);
222  boolean allesGut = true;
223  if (!matchAccepting.matches()) {
224  stats.addWarning(FILENAME_CONVENTIONS,
225  filename + " does not follow "
226  + "filename conventions for HZSK corpora");
227  exmaError.addError(FILENAME_CONVENTIONS, cd.getURL().getFile(), "", "", false, "Error: " + filename + " does not follow "
228  + "filename conventions for HZSK corpora");
229  allesGut = false;
230  }
231  Matcher matchUnaccepting = unacceptable.matcher(filename);
232  if (matchUnaccepting.find()) {
233  stats.addWarning(FILENAME_CONVENTIONS,
234  filename + " contains "
235  + "characters that may break in HZSK repository");
236  exmaError.addError(FILENAME_CONVENTIONS, cd.getURL().getFile(), "", "", false, "Error: " + filename + " contains "
237  + "characters that may break in HZSK repository");
238  allesGut = false;
239  }
240 
241  if (allesGut) {
242  stats.addCorrect(FILENAME_CONVENTIONS,
243  filename + " is OK by HZSK standards.");
244  }
245  return stats;
246  }
247 
251  @Override
252  public Report fix(CorpusData cd) throws SAXException, JDOMException, IOException, JexmaraldaException {
253  report.addCritical(FILENAME_CONVENTIONS,
254  "File names which do not comply with conventions cannot be fixed automatically");
255  return report;
256  }
257 
263  @Override
264  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
265  try {
266  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
267  Class clSecond = Class.forName("de.uni_hamburg.corpora.UnspecifiedXMLData");
268  Class clThird = Class.forName("de.uni_hamburg.corpora.ComaData");
269  IsUsableFor.add(cl);
270  IsUsableFor.add(clSecond);
271  IsUsableFor.add(clThird);
272  } catch (ClassNotFoundException ex) {
273  Logger.getLogger(FilenameChecker.class.getName()).log(Level.SEVERE, null, ex);
274  }
275  return IsUsableFor;
276  }
277 
278 }
CommandLine handleCommandLine(String[] args, List< Option > extraOptions)
Collection< Class<?extends CorpusData > > getIsUsableFor()
void addCritical(String description)
Definition: Report.java:101
void addWarning(String statId, String description)
Definition: Report.java:152
void addCorrect(String statId, String description)
Definition: Report.java:205
void addException(Throwable e, String description)
Definition: Report.java:275