9 package de.uni_hamburg.corpora.validation;
16 import java.io.FileInputStream;
17 import java.io.FileNotFoundException;
18 import java.io.IOException;
19 import java.util.ArrayList;
20 import java.util.Collection;
21 import java.util.HashSet;
23 import java.util.List;
24 import java.util.Stack;
25 import javax.xml.parsers.DocumentBuilder;
26 import javax.xml.parsers.DocumentBuilderFactory;
27 import javax.xml.parsers.ParserConfigurationException;
28 import org.apache.commons.cli.Option;
29 import org.w3c.dom.Document;
30 import org.w3c.dom.Element;
31 import org.w3c.dom.Node;
32 import org.w3c.dom.NodeList;
33 import org.w3c.dom.Text;
34 import org.xml.sax.SAXException;
37 import java.net.URISyntaxException;
// Directory path used as the prefix when turning canonical file paths into
// relative ones; starts as the current directory and is later replaced by the
// canonical path of the coma file's parent directory when that parent exists.
46 String referencePath =
"./";
// File names skipped by the directory walk (compared against File.getName()).
51 final List<String> whitelist;
// File extensions skipped by the directory walk (compared against getFileExtension(file)).
52 final List<String> fileendingwhitelist;
// Directory names whose contents are skipped: an entry is ignored when its
// parent or grandparent directory carries one of these names.
53 final List<String> directorywhitelist;
// Initialises the three whitelists with their built-in defaults.
// NOTE(review): the enclosing header is not visible in this listing; since the
// lists are final fields this is presumably the constructor body — confirm.

// Version-control and housekeeping files that never need a coma entry.
59 whitelist =
new ArrayList<String>();
60 whitelist.add(
".git");
61 whitelist.add(
".gitignore");
62 whitelist.add(
"README");
63 whitelist.add(
"README.md");
64 whitelist.add(
".gitattributes");
65 whitelist.add(
"Thumbs.db");
// No file extension is whitelisted by default; entries are added later on demand.
66 fileendingwhitelist =
new ArrayList<String>();
// Directories whose direct children and grandchildren are exempt from the check.
67 directorywhitelist =
new ArrayList<String>();
68 directorywhitelist.add(
"curation");
69 directorywhitelist.add(
"resources");
70 directorywhitelist.add(
"metadata");
72 directorywhitelist.add(
"corpus-utilities");
73 directorywhitelist.add(
"corpus-materials");
// Core coverage check. Only the throws clause of the signature is visible in
// this listing, and several statements of the body are missing.
// From the visible code: it gathers relative paths of files found on disk,
// gathers the paths named in NSLink and relPath elements of the parsed
// document, and reports every on-disk path either as correct (named in the
// document) or as critical (not named).
81 throws SAXException, IOException, ParserConfigurationException, URISyntaxException {
84 String[] path =
new String[1];
// Drops the first five characters of the URL string — presumably the "file:"
// scheme prefix; TODO confirm against the URLs cd actually carries.
85 path[0] = cd.getURL().toString().substring(5);
// Description texts; the call they are arguments of is not visible here.
87 "Checks Exmaralda .coma file against directory, to find " 88 +
"undocumented files",
89 "If input is a directory, performs recursive check " 90 +
"from that directory, otherwise checks input file");
93 System.out.println(
"Checking coma file against directory...");
97 System.out.println(
" * " + f.getName());
// Remember the coma file name, then resolve everything relative to the
// directory containing it (falling back to "./" when there is no parent).
100 comaLoc = f.getName();
102 referencePath =
"./";
103 if (f.getParentFile() != null) {
104 referenceFile = f.getParentFile();
105 referencePath = f.getParentFile().getCanonicalPath();
// Relative paths of the files found while walking the directory tree.
107 Set<String> allFilesPaths =
new HashSet<String>();
// First of three near-identical directory walks (loop variables a, b, c).
// NOTE(review): the three walks duplicate the same logic; a shared helper
// would remove the repetition.
109 Stack<File> dirs =
new Stack<File>();
// Iterative walk with an explicit stack of directories still to visit.
112 while (!dirs.empty()) {
// NOTE(review): File.listFiles() returns null for non-directories and on
// I/O errors; the for-loop below would then throw NullPointerException.
113 File files[] = dirs.pop().listFiles();
114 for (File a : files) {
// Skip entries whitelisted by name, by extension, or by the name of their
// parent / grandparent directory.
// NOTE(review): getParentFile().getParentFile() can be null close to the
// filesystem root — confirm this cannot happen for real corpora.
115 if (whitelist.contains(a.getName()) || fileendingwhitelist.contains(getFileExtension(a)) || directorywhitelist.contains(a.getParentFile().getName()) || directorywhitelist.contains(a.getParentFile().getParentFile().getName())) {
117 }
else if (a.isDirectory()) {
119 }
else if (a.getName().endsWith(
".coma")) {
// More than one .coma file in the tree is reported as critical.
121 if (comacounter > 1) {
122 stats.
addCritical(
function, cd,
"There is more than one coma file in your corpus " + a.getName());
124 System.out.println(comacounter);
127 String relPath = stripPrefix(a.getCanonicalPath(),
// stripPrefix hands back its input untouched when the prefix does not match,
// so equality here means no relative path could be derived.
129 if (relPath.equals(a.getCanonicalPath())) {
130 System.out.println(
"Cannot figure out relative path" 131 +
" for: " + a.getCanonicalPath());
132 stats.
addCritical(
function, cd,
"Cannot figure out relative path" 133 +
" for: " + a.getCanonicalPath());
135 allFilesPaths.add(relPath);
// Second walk; same structure as the first.
// NOTE(review): raw type — new Stack() should be new Stack<File>().
142 Stack<File> dirs =
new Stack();
145 while (!dirs.empty()) {
146 File files[] = dirs.pop().listFiles();
147 for (File b : files) {
148 if (whitelist.contains(b.getName()) || fileendingwhitelist.contains(getFileExtension(b)) || directorywhitelist.contains(b.getParentFile().getName()) || directorywhitelist.contains(b.getParentFile().getParentFile().getName())) {
150 }
else if (b.isDirectory()) {
152 }
else if (b.getName().endsWith(
".coma")) {
154 if (comacounter > 1) {
155 stats.
addCritical(
function, cd,
"There is more than one coma file in your corpus " + b.getName());
157 System.out.println(comacounter);
160 String relPath = stripPrefix(b.getCanonicalPath(),
162 if (relPath.equals(b.getCanonicalPath())) {
163 System.out.println(
"Cannot figure out relative path" 164 +
" for: " + b.getCanonicalPath());
165 stats.
addCritical(
function, cd,
"Cannot figure out relative path" 166 +
" for: " + b.getCanonicalPath());
168 allFilesPaths.add(relPath);
// Third walk: runs only when nothing has been collected so far and starts
// from referenceFile, using referencePath as the prefix.
174 if (allFilesPaths.size() == 0) {
175 Stack<File> dirs =
new Stack();
176 dirs.add(referenceFile);
177 String prefix = referencePath;
178 while (!dirs.empty()) {
179 File files[] = dirs.pop().listFiles();
180 for (File c : files) {
181 if (whitelist.contains(c.getName()) || fileendingwhitelist.contains(getFileExtension(c)) || directorywhitelist.contains(c.getParentFile().getName()) || directorywhitelist.contains(c.getParentFile().getParentFile().getName())) {
183 }
else if (c.isDirectory()) {
185 }
else if (c.getName().endsWith(
".coma")) {
187 if (comacounter > 1) {
188 stats.
addCritical(
function, cd,
"There is more than one coma file in your corpus " + c.getName());
190 System.out.println(comacounter);
193 String relPath = stripPrefix(c.getCanonicalPath(),
195 if (relPath.equals(c.getCanonicalPath())) {
196 System.out.println(
"Cannot figure out relative path" 197 +
" for: " + c.getCanonicalPath());
198 stats.
addCritical(
function, cd,
"Cannot figure out relative path" 199 +
" for: " + c.getCanonicalPath());
201 allFilesPaths.add(relPath);
// Paths taken from the text content of every NSLink element.
207 Set<String> NSLinksPaths =
new HashSet<String>();
// NOTE(review): the factory is used with default settings; if coma files can
// come from untrusted sources, DTDs / external entities should be disabled.
208 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
209 DocumentBuilder db = dbf.newDocumentBuilder();
211 NodeList nslinks = doc.getElementsByTagName(
"NSLink");
212 for (
int i = 0; i < nslinks.getLength(); i++) {
213 Element nslink = (Element) nslinks.item(i);
214 NodeList nstexts = nslink.getChildNodes();
215 for (
int j = 0; j < nstexts.getLength(); j++) {
216 Node maybeText = nstexts.item(j);
// Children that are not text nodes are only logged.
217 if (maybeText.getNodeType() != Node.TEXT_NODE) {
218 System.out.println(
"This is not a text node: " 222 Text nstext = (Text) nstexts.item(j);
223 String nspath = nstext.getWholeText();
// Convert '/' to the platform separator so the value is comparable with the
// paths derived from canonical file paths.
226 nspath = nspath.replace(
'/', File.separatorChar);
228 NSLinksPaths.add(nspath);
// Paths taken from the text content of every relPath element; handled the
// same way as the NSLink values above.
231 Set<String> RelPaths =
new HashSet<String>();
232 NodeList relpathnodes = doc.getElementsByTagName(
"relPath");
233 for (
int i = 0; i < relpathnodes.getLength(); i++) {
234 Element relpathnode = (Element) relpathnodes.item(i);
235 NodeList reltexts = relpathnode.getChildNodes();
236 for (
int j = 0; j < reltexts.getLength(); j++) {
237 Node maybeText = reltexts.item(j);
238 if (maybeText.getNodeType() != Node.TEXT_NODE) {
239 System.out.println(
"This is not a text node: " 243 Text reltext = (Text) reltexts.item(j);
244 String relpath = reltext.getWholeText();
247 relpath = relpath.replace(
'/', File.separatorChar);
248 System.out.println(relpath);
249 RelPaths.add(relpath);
// Union of both sets: every path the document mentions.
252 Set<String> comaPaths =
new HashSet<String>(NSLinksPaths);
253 comaPaths.addAll(RelPaths);
// Compare each on-disk path with the paths the document mentions.
254 for (String st : allFilesPaths) {
255 if (comaPaths.contains(st)) {
256 stats.
addCorrect(
function, cd,
"File both in coma and filesystem: " + st);
// Presumably the else branch (its line is not visible): file missing from coma.
258 stats.
addCritical(
function, cd,
"File on filesystem is not explained in coma: " + st);
261 }
// NOTE(review): both handlers only print the stack trace; as far as the
// visible code shows, the failure is not recorded in stats.
catch (FileNotFoundException fnfe) {
262 fnfe.printStackTrace();
263 }
catch (IOException ioe) {
264 ioe.printStackTrace();
270 private String stripPrefix(String path, String prefix) {
271 return path.replaceFirst(
"^" + prefix.replace(
"\\",
"\\\\")
272 + File.separator.replace(
"\\",
"\\\\"),
"");
// Looks up the ComaData class by its fully qualified name at run time; a
// missing class is caught as ClassNotFoundException just below.
// NOTE(review): the enclosing method header is not visible in this listing —
// presumably getIsUsableFor(); confirm. `Class` is used as a raw type here.
284 Class cl = Class.forName(
"de.uni_hamburg.corpora.ComaData");
286 }
catch (ClassNotFoundException ex) {
// Registers s as a whitelisted file extension; the directory walk skips files
// whose getFileExtension(...) result is contained in this list.
297 fileendingwhitelist.add(s);
/**
 * Derives the extension of a file from its name.
 * Only part of this method is visible in this listing: the condition guarding
 * the assignment below and the return statement are missing.
 *
 * @param f the file whose name is inspected
 */
300 private String getFileExtension(File f) {
301 String extension =
"";
302 String fileName = f.getName();
// Index of the last dot in the name, or -1 when the name has none.
303 int i = fileName.lastIndexOf(
'.');
// Index of the last path separator of either flavour, or -1 when absent.
304 int p = Math.max(fileName.lastIndexOf(
'/'), fileName.lastIndexOf(
'\\'));
// Everything after the last dot.
// NOTE(review): presumably guarded by a comparison of i and p — confirm.
307 extension = fileName.substring(i + 1);
// Human-readable description of this checker.
// NOTE(review): the text speaks of referenced files that do not exist and of
// warnings, whereas the visible check reports files on disk that the coma
// file does not mention, and reports them as critical — confirm the wording.
318 String description =
"This class is a validator for Coma file references;" 319 +
" it checks Exmaralda coma file for file references if a referenced " 320 +
"file does not exist, issues a warning;";
/**
 * Corpus-level entry point: obtains the coma data from the corpus and hands
 * it to the function(cd, fix) overload, storing the outcome in stats.
 * The remainder of the method is not visible in this listing.
 *
 * @param c   the corpus whose coma data is checked
 * @param fix passed through unchanged to the delegated call
 */
325 public Report function(
Corpus c, Boolean fix)
throws SAXException, IOException, ParserConfigurationException, URISyntaxException {
327 cd = c.getComaData();
328 stats =
function(cd, fix);
CommandLine handleCommandLine(String[] args, List< Option > extraOptions)
void addFileEndingWhiteListString(String s)
Collection< File > getInputFiles()
ComaFileCoverageChecker()
void addCritical(String description)
static String InputStream2String(InputStream is)
void addWhiteListString(String s)
void addCorrect(String statId, String description)
static InputStream String2InputStream(String s)
void addException(Throwable e, String description)
Collection< Class<?extends CorpusData > > getIsUsableFor()