corpus-services  1.0
ComaFileCoverageChecker.java
Go to the documentation of this file.
1 
9 package de.uni_hamburg.corpora.validation;
10 
15 import java.io.File;
16 import java.io.FileInputStream;
17 import java.io.FileNotFoundException;
18 import java.io.IOException;
19 import java.util.ArrayList;
20 import java.util.Collection;
21 import java.util.HashSet;
22 import java.util.Set;
23 import java.util.List;
24 import java.util.Stack;
25 import javax.xml.parsers.DocumentBuilder;
26 import javax.xml.parsers.DocumentBuilderFactory;
27 import javax.xml.parsers.ParserConfigurationException;
28 import org.apache.commons.cli.Option;
29 import org.w3c.dom.Document;
30 import org.w3c.dom.Element;
31 import org.w3c.dom.Node;
32 import org.w3c.dom.NodeList;
33 import org.w3c.dom.Text;
34 import org.xml.sax.SAXException;
35 
37 import java.net.URISyntaxException;
38 
43 public class ComaFileCoverageChecker extends Checker implements CorpusFunction {
44 
45  ValidatorSettings settings;
46  String referencePath = "./";
47  File referenceFile;
48  String comaLoc = "";
49  int comacounter = 0;
50 
51  final List<String> whitelist;
52  final List<String> fileendingwhitelist;
53  final List<String> directorywhitelist;
54 
56  //no fixing available
57  super(false);
58  // these are acceptable
59  whitelist = new ArrayList<String>();
60  whitelist.add(".git");
61  whitelist.add(".gitignore");
62  whitelist.add("README");
63  whitelist.add("README.md");
64  whitelist.add(".gitattributes");
65  whitelist.add("Thumbs.db");
66  fileendingwhitelist = new ArrayList<String>();
67  directorywhitelist = new ArrayList<String>();
68  directorywhitelist.add("curation");
69  directorywhitelist.add("resources");
70  directorywhitelist.add("metadata");
71  //they are not needed before publication
72  directorywhitelist.add("corpus-utilities");
73  directorywhitelist.add("corpus-materials");
74  }
75 
80  public Report function(CorpusData cd, Boolean fix)
81  throws SAXException, IOException, ParserConfigurationException, URISyntaxException {
82  Report stats = new Report();
83  // FIXME:
84  String[] path = new String[1];
85  path[0] = cd.getURL().toString().substring(5);
86  settings = new ValidatorSettings("FileCoverageChecker",
87  "Checks Exmaralda .coma file against directory, to find "
88  + "undocumented files",
89  "If input is a directory, performs recursive check "
90  + "from that directory, otherwise checks input file");
91  settings.handleCommandLine(path, new ArrayList<Option>());
92  if (settings.isVerbose()) {
93  System.out.println("Checking coma file against directory...");
94  }
95  for (File f : settings.getInputFiles()) {
96  if (settings.isVerbose()) {
97  System.out.println(" * " + f.getName());
98  }
99  try {
100  comaLoc = f.getName();
101  String s = TypeConverter.InputStream2String(new FileInputStream(f));
102  referencePath = "./";
103  if (f.getParentFile() != null) {
104  referenceFile = f.getParentFile();
105  referencePath = f.getParentFile().getCanonicalPath();
106  }
107  Set<String> allFilesPaths = new HashSet<String>();
108  if (settings.getDataDirectory() != null) {
109  Stack<File> dirs = new Stack<File>();
110  dirs.add(settings.getDataDirectory());
111  String prefix = settings.getDataDirectory().getCanonicalPath();
112  while (!dirs.empty()) {
113  File files[] = dirs.pop().listFiles();
114  for (File a : files) {
115  if (whitelist.contains(a.getName()) || fileendingwhitelist.contains(getFileExtension(a)) || directorywhitelist.contains(a.getParentFile().getName()) || directorywhitelist.contains(a.getParentFile().getParentFile().getName())) {
116  continue;
117  } else if (a.isDirectory()) {
118  dirs.add(a);
119  } else if (a.getName().endsWith(".coma")) {
120  comacounter++;
121  if (comacounter > 1) {
122  stats.addCritical(function, cd, "There is more than one coma file in your corpus " + a.getName());
123  }
124  System.out.println(comacounter);
125  continue;
126  } else {
127  String relPath = stripPrefix(a.getCanonicalPath(),
128  prefix);
129  if (relPath.equals(a.getCanonicalPath())) {
130  System.out.println("Cannot figure out relative path"
131  + " for: " + a.getCanonicalPath());
132  stats.addCritical(function, cd, "Cannot figure out relative path"
133  + " for: " + a.getCanonicalPath());
134  } else {
135  allFilesPaths.add(relPath);
136  }
137  }
138  }
139  }
140  }
141  if (settings.getBaseDirectory() != null) {
142  Stack<File> dirs = new Stack();
143  dirs.add(settings.getBaseDirectory());
144  String prefix = settings.getBaseDirectory().getCanonicalPath();
145  while (!dirs.empty()) {
146  File files[] = dirs.pop().listFiles();
147  for (File b : files) {
148  if (whitelist.contains(b.getName()) || fileendingwhitelist.contains(getFileExtension(b)) || directorywhitelist.contains(b.getParentFile().getName()) || directorywhitelist.contains(b.getParentFile().getParentFile().getName())) {
149  continue;
150  } else if (b.isDirectory()) {
151  dirs.add(b);
152  } else if (b.getName().endsWith(".coma")) {
153  comacounter++;
154  if (comacounter > 1) {
155  stats.addCritical(function, cd, "There is more than one coma file in your corpus " + b.getName());
156  }
157  System.out.println(comacounter);
158  continue;
159  } else {
160  String relPath = stripPrefix(b.getCanonicalPath(),
161  prefix);
162  if (relPath.equals(b.getCanonicalPath())) {
163  System.out.println("Cannot figure out relative path"
164  + " for: " + b.getCanonicalPath());
165  stats.addCritical(function, cd, "Cannot figure out relative path"
166  + " for: " + b.getCanonicalPath());
167  } else {
168  allFilesPaths.add(relPath);
169  }
170  }
171  }
172  }
173  }
174  if (allFilesPaths.size() == 0) {
175  Stack<File> dirs = new Stack();
176  dirs.add(referenceFile);
177  String prefix = referencePath;
178  while (!dirs.empty()) {
179  File files[] = dirs.pop().listFiles();
180  for (File c : files) {
181  if (whitelist.contains(c.getName()) || fileendingwhitelist.contains(getFileExtension(c)) || directorywhitelist.contains(c.getParentFile().getName()) || directorywhitelist.contains(c.getParentFile().getParentFile().getName())) {
182  continue;
183  } else if (c.isDirectory()) {
184  dirs.add(c);
185  } else if (c.getName().endsWith(".coma")) {
186  comacounter++;
187  if (comacounter > 1) {
188  stats.addCritical(function, cd, "There is more than one coma file in your corpus " + c.getName());
189  }
190  System.out.println(comacounter);
191  continue;
192  } else {
193  String relPath = stripPrefix(c.getCanonicalPath(),
194  prefix);
195  if (relPath.equals(c.getCanonicalPath())) {
196  System.out.println("Cannot figure out relative path"
197  + " for: " + c.getCanonicalPath());
198  stats.addCritical(function, cd, "Cannot figure out relative path"
199  + " for: " + c.getCanonicalPath());
200  } else {
201  allFilesPaths.add(relPath);
202  }
203  }
204  }
205  }
206  }
207  Set<String> NSLinksPaths = new HashSet<String>();
208  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
209  DocumentBuilder db = dbf.newDocumentBuilder();
210  Document doc = db.parse(TypeConverter.String2InputStream(s));
211  NodeList nslinks = doc.getElementsByTagName("NSLink");
212  for (int i = 0; i < nslinks.getLength(); i++) {
213  Element nslink = (Element) nslinks.item(i);
214  NodeList nstexts = nslink.getChildNodes();
215  for (int j = 0; j < nstexts.getLength(); j++) {
216  Node maybeText = nstexts.item(j);
217  if (maybeText.getNodeType() != Node.TEXT_NODE) {
218  System.out.println("This is not a text node: "
219  + maybeText);
220  continue;
221  }
222  Text nstext = (Text) nstexts.item(j);
223  String nspath = nstext.getWholeText();
224  // added this line so it compares Coma NSLinks in the correct format of the OS
225  // it still doesn't work if there are absoulte paths in the NSlinks, but that shouldn#t be the case anyway
226  nspath = nspath.replace('/', File.separatorChar);
227  //System.out.println(nspath);
228  NSLinksPaths.add(nspath);
229  }
230  }
231  Set<String> RelPaths = new HashSet<String>();
232  NodeList relpathnodes = doc.getElementsByTagName("relPath");
233  for (int i = 0; i < relpathnodes.getLength(); i++) {
234  Element relpathnode = (Element) relpathnodes.item(i);
235  NodeList reltexts = relpathnode.getChildNodes();
236  for (int j = 0; j < reltexts.getLength(); j++) {
237  Node maybeText = reltexts.item(j);
238  if (maybeText.getNodeType() != Node.TEXT_NODE) {
239  System.out.println("This is not a text node: "
240  + maybeText);
241  continue;
242  }
243  Text reltext = (Text) reltexts.item(j);
244  String relpath = reltext.getWholeText();
245  // added this line so it compares Coma NSLinks in the correct format of the OS
246  // it still doesn't work if there are absoulte paths in the NSlinks, but that shouldn#t be the case anyway
247  relpath = relpath.replace('/', File.separatorChar);
248  System.out.println(relpath);
249  RelPaths.add(relpath);
250  }
251  }
252  Set<String> comaPaths = new HashSet<String>(NSLinksPaths);
253  comaPaths.addAll(RelPaths);
254  for (String st : allFilesPaths) {
255  if (comaPaths.contains(st)) {
256  stats.addCorrect(function, cd, "File both in coma and filesystem: " + st);
257  } else {
258  stats.addCritical(function, cd, "File on filesystem is not explained in coma: " + st);
259  }
260  }
261  } catch (FileNotFoundException fnfe) {
262  fnfe.printStackTrace();
263  } catch (IOException ioe) {
264  ioe.printStackTrace();
265  }
266  }
267  return stats;
268  }
269 
270  private String stripPrefix(String path, String prefix) {
271  return path.replaceFirst("^" + prefix.replace("\\", "\\\\")
272  + File.separator.replace("\\", "\\\\"), "");
273 
274  }
275 
281  @Override
282  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
283  try {
284  Class cl = Class.forName("de.uni_hamburg.corpora.ComaData");
285  IsUsableFor.add(cl);
286  } catch (ClassNotFoundException ex) {
287  report.addException(ex, "Usable class not found.");
288  }
289  return IsUsableFor;
290  }
291 
292  public void addWhiteListString(String s) {
293  whitelist.add(s);
294  }
295 
296  public void addFileEndingWhiteListString(String s) {
297  fileendingwhitelist.add(s);
298  }
299 
300  private String getFileExtension(File f) {
301  String extension = "";
302  String fileName = f.getName();
303  int i = fileName.lastIndexOf('.');
304  int p = Math.max(fileName.lastIndexOf('/'), fileName.lastIndexOf('\\'));
305 
306  if (i > p) {
307  extension = fileName.substring(i + 1);
308  }
309  return extension;
310  }
311 
316  @Override
317  public String getDescription() {
318  String description = "This class is a validator for Coma file references;"
319  + " it checks Exmaralda coma file for file references if a referenced "
320  + "file does not exist, issues a warning;";
321  return description;
322  }
323 
324  @Override
325  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException, URISyntaxException {
326  Report stats;
327  cd = c.getComaData();
328  stats = function(cd, fix);
329  return stats;
330  }
331 
332 
333 }
CommandLine handleCommandLine(String[] args, List< Option > extraOptions)
void addCritical(String description)
Definition: Report.java:104
static String InputStream2String(InputStream is)
void addCorrect(String statId, String description)
Definition: Report.java:217
static InputStream String2InputStream(String s)
void addException(Throwable e, String description)
Definition: Report.java:287
Collection< Class<?extends CorpusData > > getIsUsableFor()