9 package de.uni_hamburg.corpora.validation;
17 import java.io.FileOutputStream;
18 import java.io.IOException;
19 import java.net.URISyntaxException;
21 import java.nio.file.Files;
22 import java.nio.file.Paths;
23 import java.util.ArrayList;
24 import java.util.Collection;
25 import java.util.HashMap;
26 import java.util.HashSet;
27 import java.util.Hashtable;
29 import java.util.List;
30 import java.util.logging.Level;
31 import java.util.logging.Logger;
33 import java.util.regex.Matcher;
34 import java.util.regex.Pattern;
35 import javax.xml.parsers.DocumentBuilder;
36 import javax.xml.parsers.DocumentBuilderFactory;
37 import javax.xml.parsers.ParserConfigurationException;
39 import org.apache.commons.cli.Option;
40 import org.apache.commons.cli.CommandLine;
41 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
42 import org.jdom.JDOMException;
43 import org.w3c.dom.Document;
44 import org.xml.sax.SAXException;
56 final String FILENAME_CONVENTIONS =
"filename-conventions";
67 stats = oldExceptionalCheck(rootdir);
68 }
catch (IOException ioe) {
74 private Report oldExceptionalCheck(File rootdir)
77 return oldRecursiveCheck(rootdir, stats);
80 private Report oldRecursiveCheck(File f,
81 Report stats)
throws IOException {
82 String filename = f.getName();
83 Matcher matchAccepting = acceptable.matcher(filename);
84 boolean allesGut =
true;
85 if (!matchAccepting.matches()) {
87 filename +
" does not follow " 88 +
"filename conventions for HZSK corpora");
91 Matcher matchUnaccepting = unacceptable.matcher(filename);
92 if (matchUnaccepting.find()) {
94 filename +
" contains " 95 +
"characters that may break in HZSK repository");
101 filename +
" is OK by HZSK standards.");
103 if (f.isDirectory()) {
104 File[] files = f.listFiles();
105 for (File g : files) {
106 stats = oldRecursiveCheck(g, stats);
114 "Checks Exmaralda .coma file against directory, to find " 115 +
"undocumented files",
116 "If input is a directory, performs recursive check " 117 +
"from that directory, otherwise checks input file");
118 List<Option> patternOptions =
new ArrayList<Option>();
119 patternOptions.add(
new Option(
"a",
"accept",
true,
"add an acceptable " 121 patternOptions.add(
new Option(
"d",
"disallow",
true,
"add an illegal " 127 if (cmd.hasOption(
"accept")) {
128 acceptable = Pattern.compile(cmd.getOptionValue(
"accept"));
130 acceptable = Pattern.compile(
"^[A-Za-z0-9_.-]*$");
132 if (cmd.hasOption(
"disallow")) {
133 unacceptable = Pattern.compile(cmd.getOptionValue(
"disallow"));
135 unacceptable = Pattern.compile(
"[ üäöÜÄÖ]");
138 System.out.println(
"Checking coma file against directory...");
143 System.out.println(
" * " + f.getName());
150 public static void main(String[] args) {
166 stats = exceptionalCheck(cd);
167 }
catch (ParserConfigurationException pce) {
168 stats.
addException(pce, fileLoc +
": Unknown parsing error");
169 }
catch (SAXException saxe) {
170 stats.
addException(saxe, fileLoc +
": Unknown parsing error");
171 }
catch (IOException ioe) {
172 stats.
addException(ioe, fileLoc +
": Unknown file reading error");
173 }
catch (URISyntaxException ex) {
174 stats.
addException(ex, fileLoc +
": Unknown file reading error");
186 throws SAXException, IOException, ParserConfigurationException, URISyntaxException {
187 File f =
new File(cd.
getURL().toString());
188 String filename = f.getName();
189 File fp = f.getParentFile().getParentFile();
190 String[] path =
new String[1];
191 path[0] = fp.getPath().substring(6);
193 "Checks Exmaralda .coma file against directory, to find " 194 +
"undocumented files",
195 "If input is a directory, performs recursive check " 196 +
"from that directory, otherwise checks input file");
197 List<Option> patternOptions =
new ArrayList<Option>();
198 patternOptions.add(
new Option(
"a",
"accept",
true,
"add an acceptable " 200 patternOptions.add(
new Option(
"d",
"disallow",
true,
"add an illegal " 206 if (cmd.hasOption(
"accept")) {
207 acceptable = Pattern.compile(cmd.getOptionValue(
"accept"));
209 acceptable = Pattern.compile(
"^[A-Za-z0-9_.-]*$");
211 if (cmd.hasOption(
"disallow")) {
212 unacceptable = Pattern.compile(cmd.getOptionValue(
"disallow"));
214 unacceptable = Pattern.compile(
"[ üäöÜÄÖ]");
217 System.out.println(
"Checking coma file against directory...");
221 Matcher matchAccepting = acceptable.matcher(filename);
222 boolean allesGut =
true;
223 if (!matchAccepting.matches()) {
225 filename +
" does not follow " 226 +
"filename conventions for HZSK corpora");
227 exmaError.addError(FILENAME_CONVENTIONS, cd.
getURL().getFile(),
"",
"",
false,
"Error: " + filename +
" does not follow " 228 +
"filename conventions for HZSK corpora");
231 Matcher matchUnaccepting = unacceptable.matcher(filename);
232 if (matchUnaccepting.find()) {
234 filename +
" contains " 235 +
"characters that may break in HZSK repository");
236 exmaError.addError(FILENAME_CONVENTIONS, cd.
getURL().getFile(),
"",
"",
false,
"Error: " + filename +
" contains " 237 +
"characters that may break in HZSK repository");
243 filename +
" is OK by HZSK standards.");
254 "File names which do not comply with conventions cannot be fixed automatically");
266 Class cl = Class.forName(
"de.uni_hamburg.corpora.BasicTranscriptionData");
267 Class clSecond = Class.forName(
"de.uni_hamburg.corpora.UnspecifiedXMLData");
268 Class clThird = Class.forName(
"de.uni_hamburg.corpora.ComaData");
270 IsUsableFor.add(clSecond);
271 IsUsableFor.add(clThird);
272 }
catch (ClassNotFoundException ex) {
273 Logger.getLogger(
FilenameChecker.class.getName()).log(Level.SEVERE, null, ex);
CommandLine handleCommandLine(String[] args, List< Option > extraOptions)
Collection< File > getInputFiles()
Report check(CorpusData cd)
Collection< Class<?extends CorpusData > > getIsUsableFor()
static ExmaErrorList exmaError
void addCritical(String description)
Report oldCheck(File rootdir)
static void main(String[] args)
void addWarning(String statId, String description)
void addCorrect(String statId, String description)
Report doMain(String[] args)
void addException(Throwable e, String description)
Report fix(CorpusData cd)