corpus-services  1.0
CorpusMagician.java
Go to the documentation of this file.
1 package de.uni_hamburg.corpora;
2 
57 import java.io.File;
58 import java.io.FileInputStream;
59 import java.io.FileNotFoundException;
60 import java.io.IOException;
61 import java.io.UnsupportedEncodingException;
62 import java.net.MalformedURLException;
63 import java.net.URISyntaxException;
64 import java.net.URL;
65 import java.util.ArrayList;
66 import java.util.Collection;
67 import java.nio.file.Paths;
68 import java.util.Collections;
69 import org.apache.commons.cli.CommandLine;
70 import org.apache.commons.cli.CommandLineParser;
71 import org.apache.commons.cli.DefaultParser;
72 import org.apache.commons.cli.HelpFormatter;
73 import org.apache.commons.cli.Option;
74 import org.apache.commons.cli.Options;
75 import org.apache.commons.cli.ParseException;
76 import java.util.Iterator;
77 import java.util.List;
78 import java.util.Properties;
79 import javax.xml.parsers.ParserConfigurationException;
80 import javax.xml.transform.TransformerException;
81 import javax.xml.xpath.XPathExpressionException;
82 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
83 import org.jdom.Document;
84 import org.jdom.JDOMException;
85 import org.xml.sax.SAXException;
86 
93 public class CorpusMagician {
94 
95  //the whole corpus I want to run checks on
96  static Corpus corpus;
97  //a collection of unordered files I want to run checks on
98  Collection<CorpusData> cdc;
99  //a single file I want to run checks on
100  CorpusData corpusData;
101  //Basedirectory if it exists
102  static URL basedirectory;
103  //all functions there are in the code
104  static Collection<String> allExistingCFs;
105  //all functions that should be run
106  static Collection<String> chosencorpusfunctions = new ArrayList<String>();
107  static Collection<CorpusFunction> corpusfunctions = new ArrayList<CorpusFunction>();
108  //need to have Map or something for this
109  static Collection<Class<? extends CorpusData>> neededcorpusdatatypes = new ArrayList<Class<? extends CorpusData>>();
110  //the final Report
111  static Report report = new Report();
112  //a list of all the available corpus data (no java objects, just URLs)
113  static ArrayList<URL> alldata = new ArrayList<URL>();
114  static CorpusIO cio = new CorpusIO();
115  static boolean fixing = false;
116  static boolean iserrorsonly = false;
117  static boolean isfixesjson = false;
118  static CommandLine cmd = null;
119  //the final Exmaralda error list
120  public static ExmaErrorList exmaError = new ExmaErrorList();
121  static Properties cfProperties = new Properties();
122  static PrettyPrinter pp = new PrettyPrinter();
123  static String settingsfilepath = "settings.xml";
124  //Properties Key Names
125  static String fsm = "fsm";
126  static String segmentation = "segmentation";
127  static String lang = "lang";
128  static String spelllang = "spelllang";
129  static String corpusname = "corpusname";
130  static String kml = "kml";
131  static String mode = "mode";
132  static URL reportlocation;
133  static URL inputurl;
134  static boolean isCorpus = false;
135  static boolean isCollection = false;
136 
137  public CorpusMagician() {
138  }
139 
140  //TODO we need a webservice for this functionality too
141  //in the future (for repo and external users)
142  public static void main(String[] args) {
143 
144  //first args needs to be the URL
145  //check if it's a filepath, we could just convert it to an url
146  System.out.println("CorpusMagician is now doing its magic.");
147  CorpusMagician corpuma = new CorpusMagician();
148  try {
149  //create the options for the commandline
150  createCommandLineOptions(args);
151  //read the options specified on the commandline
153  //convert strings from commandline to corpusfunction objects
154  corpusfunctions = corpusFunctionStrings2Classes(chosencorpusfunctions);
155  //find out which files the chosencorpusfunctions need as input
156  for (CorpusFunction cf : corpusfunctions) {
157  for (Class<? extends CorpusData> cecd : cf.getIsUsableFor()) {
158  if (!neededcorpusdatatypes.contains(cecd)) {
159  neededcorpusdatatypes.add(cecd);
160  }
161  }
162  }
163  //the input can be a filepath or an url pointing to a file or a folder
164  //if the input is a coma file we have a structured corpus
165  //if it is a folder or another corpus file we don't
166  //we can maybe minmize the heapspace when having a structured corpus
167  //we only want to have the data as objects that will be really needed in the functions
168  corpuma.initDataWithURL(inputurl, neededcorpusdatatypes);
169  //We can only init an corpus object if we know it's a structured corpus
170  //now all chosen functions must be run
171  //if we have the coma file, we just give Coma as Input and the Functions need to take care of using the
172  //iterating function
173  report = corpuma.runChosencorpusfunctions();
174  createReports();
175  } catch (MalformedURLException ex) {
176  report.addException(ex, "The given URL was incorrect");
177  } catch (IOException ex) {
178  report.addException(ex, "A file could not be read");
179  } catch (ParserConfigurationException ex) {
180  report.addException(ex, "A file could not be parsed");
181  } catch (TransformerException ex) {
182  report.addException(ex, "A transformation error occured");
183  } catch (SAXException ex) {
184  report.addException(ex, "An XSLT error occured");
185  } catch (JexmaraldaException ex) {
186  report.addException(ex, "An Exmaralda file reading error occured");
187  } catch (URISyntaxException ex) {
188  report.addException(ex, "A URI was incorrect");
189  } catch (XPathExpressionException ex) {
190  report.addException(ex, "An Xpath expression was incorrect");
191  } catch (ClassNotFoundException ex) {
192  report.addException(ex, "Class not found");
193  } catch (JDOMException ex) {
194  report.addException(ex, "JDOM error");
195  }
196 
197  }
198 
199 //Give it a path to a parameters file that tells you
200 //which functions with which parameters should be
201 //run on which files
202  public void readConfig(URL url) {
203  //this depends on how this file will be structured
204  }
205 
206  //this one can write a configfile with the workflow in the
207  //selected format
208  public void writeConfig(URL url) {
209  //needs to have more params
210  //this depends on how this file will be structured
211  }
212 
214  allExistingCFs.add(cf.getClass().getName());
215  }
216 
217  //creates a corpus object from an URL (filepath or "real" url)
218  //we need to make a difference between an unsorted folder, a miscellaneous file or a Coma file which represents a complete folder structure of the corpus
219  public void initDataWithURL(URL url, Collection<Class<? extends CorpusData>> clcds) throws MalformedURLException, SAXException, JexmaraldaException, URISyntaxException, IOException, ClassNotFoundException, JDOMException {
220  if (cio.isDirectory(url)) {
221  //TODO
222  //only read the filetypes from clcds!
223  cdc = cio.read(url, clcds);
224  basedirectory = url;
225  isCollection = true;
226  } else {
227  CorpusData cdata = cio.readFileURL(url);
228  //get the basedirectory
229  basedirectory = cdata.getParentURL();
230  //it could be a ComaFile if it is a Metadata file
231  if (cdata instanceof ComaData) {
232  //if it is we set the boolean
233  isCorpus = true;
234  System.out.println("It's a corpus");
235  //TODO
236  //only read the filetypes from clcds!
237  corpus = new Corpus((ComaData) cdata, clcds);
238  //otherwise it is a single file I want to check
239  } else {
240  corpusData = cdata;
241  }
242  }
243  }
244 
245  //creates a list of all the available data from an url (being a file oder directory)
246  public Collection<URL> createListofData(URL url) throws URISyntaxException, IOException {
247  //add just that url if its a file
248  //adds the urls recursively if its a directory
249  return cio.URLtoList(url);
250  }
251 
252  //checks which functions exist in the code by checking for implementations of the corpus function interface
253  //this shows that it doesn't work to just check for implementations of corpus functions
254  //probably need to check for implementations of CorpusFunction?
255  //TODO
256  public static Collection<String> getAllExistingCFs() {
257  allExistingCFs = new ArrayList<String>();
258  allExistingCFs.add("ComaApostropheChecker");
259  allExistingCFs.add("ComaNSLinksChecker");
260  allExistingCFs.add("ComaOverviewGeneration");
261  allExistingCFs.add("ComaChartsGeneration");
262  allExistingCFs.add("ZipCorpus");
263  allExistingCFs.add("HandlePidRegistration");
264  allExistingCFs.add("RemoveUnlinkedFiles");
265  allExistingCFs.add("ComaSegmentCountChecker");
266  allExistingCFs.add("ExbFileReferenceChecker");
267  allExistingCFs.add("ExbFileCoverageChecker");
268  allExistingCFs.add("ExbAnnotationPanelCheck");
269  allExistingCFs.add("EXB2INELISOTEI");
270  allExistingCFs.add("EXB2HIATISOTEI");
271  allExistingCFs.add("ExbStructureChecker");
272  allExistingCFs.add("ComaFileCoverageChecker");
273  allExistingCFs.add("NormalizeEXB");
274  allExistingCFs.add("PrettyPrintData");
275  allExistingCFs.add("RemoveAbsolutePaths");
276  allExistingCFs.add("RemoveAutoSaveExb");
277  allExistingCFs.add("XSLTChecker");
278  allExistingCFs.add("ComaAddTiersFromExbsCorrector");
279  allExistingCFs.add("ComaXsdChecker");
280  allExistingCFs.add("NgexmaraldaCorpusChecker");
281  allExistingCFs.add("FilenameChecker");
282  allExistingCFs.add("CmdiChecker");
283  allExistingCFs.add("ComaTiersDescriptionAnnotationPanelChecker");
284  allExistingCFs.add("ExbTierDisplayNameChecker");
285  allExistingCFs.add("NgTierCheckerWithAnnotation");
286  //allExistingCFs.add("XsltCheckerInel");
287  allExistingCFs.add("GenerateAnnotationPanel");
288  allExistingCFs.add("CorpusDataRegexReplacer");
289  allExistingCFs.add("ScoreHTML");
290  allExistingCFs.add("HScoreHTML");
291  allExistingCFs.add("CorpusHTML");
292  allExistingCFs.add("IAAFunctionality");
293  allExistingCFs.add("ListHTML");
294  allExistingCFs.add("ExbEventLinebreaksChecker");
295  allExistingCFs.add("MakeTimelineConsistent");
296  allExistingCFs.add("ExbSegmentationChecker");
297  allExistingCFs.add("CalculateAnnotatedTime");
298  allExistingCFs.add("AddCSVMetadataToComa");
299  allExistingCFs.add("ComaKmlForLocations");
300  allExistingCFs.add("RemoveEmptyEvents");
301  allExistingCFs.add("ComaTranscriptionsNameChecker");
302  allExistingCFs.add("ComaTierOverviewCreator");
303  allExistingCFs.add("GeneralTransformer");
304  allExistingCFs.add("ComaFedoraIdentifierLengthChecker");
305  allExistingCFs.add("ExbMP3Next2WavAdder");
306  allExistingCFs.add("ExbRefTierChecker");
307  allExistingCFs.add("ReportStatistics");
308  allExistingCFs.add("ExbSegmenter");
309  allExistingCFs.add("ExbScriptMixChecker");
310  allExistingCFs.add("DuplicateTierContentChecker");
311  allExistingCFs.add("LanguageToolChecker");
312  allExistingCFs.add("VikusViewer");
313  Collections.sort((List<String>) allExistingCFs);
314  return allExistingCFs;
315  }
316 
317  public static String getAllExistingCFsAsString() {
318  String all = "";
319  for (Iterator<String> it = getAllExistingCFs().iterator(); it.hasNext();) {
320  String s = it.next();
321  all = all + "\n" + s;
322  }
323  return all;
324  }
325 
326  public static Collection<CorpusFunction> getAllExistingCFsAsCFs() {
327 
329  }
330 
331  //TODO checks which functions can be run on specified data
332  public Collection<CorpusFunction> getUsableFunctions(CorpusData cd) {
333  //cf.IsUsableFor();
334  //some switch or if else statements for the possible java objects
335  //and a list(?) which function can be apllied to what/which functions exist?
336  Collection<CorpusFunction> usablecorpusfunctions = null;
337  return usablecorpusfunctions;
338  }
339 
340  //TODO return default functions, this is a list that needs to be somewhere
341  //or maybe its an option a corpusfunction can have?
342  public Collection<CorpusFunction> getDefaultUsableFunctions() {
343  Collection<CorpusFunction> defaultcorpusfunctions = null;
344  return defaultcorpusfunctions;
345  }
346 
347  //TODO a dialog to choose functions you want to apply
348  public Collection<String> chooseFunctionDialog() {
349  chosencorpusfunctions = null;
350  //add the chosen Functions
351  return chosencorpusfunctions;
352  }
353 
354  public static Collection<CorpusFunction> corpusFunctionStrings2Classes(Collection<String> corpusfunctionstrings) {
355  Collection<CorpusFunction> cf2strcorpusfunctions = new ArrayList<CorpusFunction>();
356  for (String function : corpusfunctionstrings) {
357  switch (function.toLowerCase()) {
358  case "comaapostrophechecker":
360  cf2strcorpusfunctions.add(cac);
361  break;
362  case "comanslinkschecker":
364  cf2strcorpusfunctions.add(cnslc);
365  break;
366  case "comaoverviewgeneration":
368  if (cfProperties != null) {
369  // Pass on the configuration parameter
370  if (cfProperties.containsKey(mode) && cfProperties.getProperty(mode).toLowerCase().equals("inel")) {
371  cog.setInel();
372  System.out.println("Mode set to inel");
373  }
374  }
375  cf2strcorpusfunctions.add(cog);
376  break;
377  case "comachartsgeneration":
378  ComaChartsGeneration coc = new ComaChartsGeneration();if (cfProperties != null) {
379  // Pass on the configuration parameter
380  if (cfProperties.containsKey(mode) && cfProperties.getProperty(mode).toLowerCase().equals("inel")) {
381  coc.setInel();
382  System.out.println("Mode set to inel");
383  }
384  }
385  cf2strcorpusfunctions.add(coc);
386  break;
387  case "comasegmentcountchecker":
389  cf2strcorpusfunctions.add(cscc);
390  break;
391  case "exbfilereferencechecker":
393  cf2strcorpusfunctions.add(efrc);
394  break;
395  case "exbfilecoveragechecker":
397  cf2strcorpusfunctions.add(efcc);
398  break;
399  case "exbannotationpanelcheck":
401  cf2strcorpusfunctions.add(eapc);
402  break;
403  case "comafilecoveragechecker":
405  if (cfProperties != null) {
406  // Pass on the configuration parameter
407  if (cfProperties.containsKey(mode) && cfProperties.getProperty(mode).equals("inel")) {
408  fcc.addFileEndingWhiteListString("flextext");
409  fcc.addWhiteListString("report-output.html");
410  fcc.addWhiteListString("Segmentation_Errors.xml");
411  fcc.addWhiteListString("Structure_Errors.xml");
412  System.out.println("Mode set to inel");
413  }
414  }
415  cf2strcorpusfunctions.add(fcc);
416  break;
417  case "prettyprintdata":
418  PrettyPrintData pd = new PrettyPrintData();
419  cf2strcorpusfunctions.add(pd);
420  break;
421  case "removeabsolutepaths":
423  cf2strcorpusfunctions.add(rap);
424  break;
425  case "removeautosaveexb":
427  cf2strcorpusfunctions.add(rase);
428  break;
429  case "xsltchecker":
430  XSLTChecker xc = new XSLTChecker();
431  if (cfProperties != null) {
432  // Pass on the configuration parameter
433  if (cfProperties.containsKey(mode) && cfProperties.getProperty(mode).toLowerCase().equals("inel")) {
434  xc.setXSLresource("/xsl/inel-checks.xsl");
435  System.out.println("Mode set to inel");
436  }
437  if (cfProperties.containsKey(fsm)) {
438  xc.setFSMpath(cfProperties.getProperty(fsm));
439  System.out.println("FSM set to " + cfProperties.getProperty(fsm));
440  }
441  }
442  cf2strcorpusfunctions.add(xc);
443  break;
444  case "comaaddtiersfromexbscorrector":
446  cf2strcorpusfunctions.add(catfec);
447  break;
448  case "comaxsdchecker":
449  ComaXsdChecker cxsd = new ComaXsdChecker();
450  cf2strcorpusfunctions.add(cxsd);
451  break;
452  case "ngexmaraldacorpuschecker":
454  cf2strcorpusfunctions.add(ngex);
455  break;
456  case "filenamechecker":
458  cf2strcorpusfunctions.add(fnc);
459  break;
460  case "cmdichecker":
461  CmdiChecker cmdi = new CmdiChecker();
462  cf2strcorpusfunctions.add(cmdi);
463  break;
464  case "comafedoraidentifierlengthchecker":
466  cf2strcorpusfunctions.add(cplc);
467  break;
468  case "comatranscriptionsnamechecker":
470  cf2strcorpusfunctions.add(cnc);
471  break;
472  case "comatiersdescriptionannotationpanelchecker":
474  cf2strcorpusfunctions.add(tcwa);
475  break;
476  case "exbtierdisplaynamechecker":
478  cf2strcorpusfunctions.add(tc);
479  break;
480  case "ngtiercheckerwithannotation":
482  cf2strcorpusfunctions.add(ngtcwa);
483  break;
484  case "exb2inelisotei":
485  EXB2HIATISOTEI eiit = new EXB2HIATISOTEI();
486  eiit.setInel();
487  if (cfProperties != null) {
488  // Pass on the configuration parameter
489  if (cfProperties.containsKey(lang)) {
490  eiit.setLanguage(cfProperties.getProperty(lang));
491  System.out.println("Language set to " + cfProperties.getProperty(lang));
492  }
493  if (cfProperties.containsKey(fsm)) {
494  eiit.setFSM(cfProperties.getProperty(fsm));
495  System.out.println("FSM set to " + cfProperties.getProperty(fsm));
496  }
497  }
498  cf2strcorpusfunctions.add(eiit);
499  break;
500  //Maybe get rid of those special cases too!
501  case "exb2inelisoteisel":
502  EXB2HIATISOTEI eiitsel = new EXB2HIATISOTEI();
503  eiitsel.setInel();
504  if (cfProperties.containsKey(fsm)) {
505  eiitsel.setFSM(cfProperties.getProperty(fsm));
506  System.out.println("FSM set to " + cfProperties.getProperty(fsm));
507  }
508  eiitsel.setLanguage("sel");
509  cf2strcorpusfunctions.add(eiitsel);
510  break;
511  case "exb2inelisoteidlg":
512  EXB2HIATISOTEI eiitdlg = new EXB2HIATISOTEI();
513  eiitdlg.setInel();
514  if (cfProperties.containsKey(fsm)) {
515  eiitdlg.setFSM(cfProperties.getProperty(fsm));
516  System.out.println("FSM set to " + cfProperties.getProperty(fsm));
517  }
518  eiitdlg.setLanguage("dlg");
519  cf2strcorpusfunctions.add(eiitdlg);
520  break;
521  case "exb2inelisoteixas":
522  EXB2HIATISOTEI eiitxas = new EXB2HIATISOTEI();
523  eiitxas.setInel();
524  if (cfProperties.containsKey(fsm)) {
525  eiitxas.setFSM(cfProperties.getProperty(fsm));
526  System.out.println("FSM set to " + cfProperties.getProperty(fsm));
527  }
528  eiitxas.setLanguage("xas");
529  cf2strcorpusfunctions.add(eiitxas);
530  break;
531  case "exb2hiatisotei":
532  EXB2HIATISOTEI ehit = new EXB2HIATISOTEI();
533  if (cfProperties != null) {
534  // Pass on the configuration parameter
535  if (cfProperties.containsKey(lang)) {
536  ehit.setLanguage(cfProperties.getProperty(lang));
537  System.out.println("Language set to " + cfProperties.getProperty(lang));
538  }
539  if (cfProperties.containsKey(mode)) {
540  if (cfProperties.getProperty(mode).toLowerCase().equals("inel")) {
541  ehit.setInel();
542  System.out.println("Mode set to inel");
543  } else if (cfProperties.getProperty(mode).toLowerCase().equals("token")) {
544  ehit.setToken();
545  System.out.println("Mode set to token");
546  }
547  }
548  if (cfProperties.containsKey(fsm)) {
549  ehit.setFSM(cfProperties.getProperty(fsm));
550  System.out.println("FSM set to " + cfProperties.getProperty(fsm));
551  }
552  }
553  cf2strcorpusfunctions.add(ehit);
554  break;
555  case "normalizeexb":
556  ExbNormalize ne = new ExbNormalize();
557  if (cfProperties != null) {
558  // Pass on the configuration parameter
559  if (cfProperties.containsKey("whitespace")) {
560  ne.setfixWhiteSpaces(cfProperties.getProperty("whitespace"));
561  System.out.println("FixWhitespace set to " + cfProperties.getProperty("whitespace"));
562  }
563  }
564  cf2strcorpusfunctions.add(ne);
565  break;
566  case "generateannotationpanel":
568  cf2strcorpusfunctions.add(gap);
569  break;
570  case "iaafunctionality":
572  cf2strcorpusfunctions.add(iaa);
573  break;
574  case "comakmlforlocations":
576  if (cfProperties != null) {
577  // Pass on the configuration parameter
578  if (cfProperties.containsKey(kml)) {
579  ckml.setKMLFilePath(cfProperties.getProperty(kml));
580  System.out.println("KML file path set to " + cfProperties.getProperty(kml));
581  }
582  }
583  cf2strcorpusfunctions.add(ckml);
584  break;
585  case "reportstatistics":
587  cf2strcorpusfunctions.add(rs);
588  break;
589  case "corpusdataregexreplacer":
590  //ToDo
592  //try custom properties for the different corpusfunctions
593  if (cfProperties != null) {
594  // Pass on the configuration parameter
595  if (cfProperties.containsKey("replace")) {
596  cdrr.setReplace(cfProperties.getProperty("replace"));
597  System.out.println("Replace set to " + cfProperties.getProperty("replace"));
598  }
599  if (cfProperties.containsKey("replacement")) {
600  cdrr.setReplacement(cfProperties.getProperty("replacement"));
601  System.out.println("Replacement set to " + cfProperties.getProperty("replacement"));
602  }
603  if (cfProperties.containsKey("xpathcontext")) {
604  cdrr.setXpathContext(cfProperties.getProperty("xpathcontext"));
605  System.out.println("Xpath set to " + cfProperties.getProperty("xpathcontext"));
606  }
607  if (cfProperties.containsKey("coma")) {
608  cdrr.setComa(cfProperties.getProperty("coma"));
609  System.out.println("Replace in Coma set to " + cfProperties.getProperty("coma"));
610  }
611  }
612  cf2strcorpusfunctions.add(cdrr);
613  break;
614  case "zipcorpus":
615  ZipCorpus zc = new ZipCorpus();
616  if (cfProperties != null) {
617  // Pass on the configuration parameter
618  if (cfProperties.containsKey("source_folder")) {
619  zc.setSourceFolder(cfProperties.getProperty("source_folder"));
620  System.out.println("Location of source folder set to " + cfProperties.getProperty("source_folder"));
621  }
622  if (cfProperties.containsKey("output_zip_file")) {
623  zc.setOutputFile(cfProperties.getProperty("output_zip_file"));
624  System.out.println("Location of output file set to " + cfProperties.getProperty("output_zip_file"));
625  }
626  if (cfProperties.containsKey("audio")) {
627  zc.setWithAudio(cfProperties.getProperty("audio"));
628  System.out.println("Should contain audio set to " + cfProperties.getProperty("audio"));
629  }
630  }
631  cf2strcorpusfunctions.add(zc);
632  break;
633  case "handlepidregistration":
635  if (cfProperties != null) {
636  // Pass on the configuration parameter
637  if (cfProperties.containsKey("user")) {
638  hppr.setUser(cfProperties.getProperty("user"));
639  System.out.println("User set to " + cfProperties.getProperty("user"));
640  }
641  if (cfProperties.containsKey("pass")) {
642  hppr.setPass(cfProperties.getProperty("pass"));
643  System.out.println("Password set to " + cfProperties.getProperty("pass").replaceAll(".", "*"));
644  //System.out.println("Password set to " + cfProperties.getProperty("pass"));
645  }
646  if (cfProperties.containsKey("prefix")) {
647  hppr.setHandlePrefix(cfProperties.getProperty("prefix"));
648  System.out.println("Prefix set to " + cfProperties.getProperty("prefix"));
649  }
650  }
651  cf2strcorpusfunctions.add(hppr);
652  break;
653  case "removeunlinkedfiles":
655  cf2strcorpusfunctions.add(ruf);
656  break;
657  case "scorehtml":
658  ScoreHTML shtml = new ScoreHTML();
659  if (cfProperties != null) {
660  if (cfProperties.containsKey(corpusname)) {
661  shtml.setCorpusName(cfProperties.getProperty(corpusname));
662  System.out.println("Corpus name set to " + cfProperties.getProperty(corpusname));
663  }
664  }
665  cf2strcorpusfunctions.add(shtml);
666  break;
667  case "hscorehtml":
668  HScoreHTML hshtml = new HScoreHTML();
669  cf2strcorpusfunctions.add(hshtml);
670  break;
671  case "corpushtml":
672  CorpusHTML chtml = new CorpusHTML();
673  cf2strcorpusfunctions.add(chtml);
674  break;
675  case "listhtml":
676  ListHTML lhtml = new ListHTML();
677  if (cfProperties != null) {
678  // Pass on the configuration parameter
679  if (cfProperties.containsKey(segmentation)) {
680  lhtml.setSegmentation(cfProperties.getProperty(segmentation));
681  System.out.println("Segmentation set to " + cfProperties.getProperty(segmentation));
682  }
683  if (cfProperties.containsKey(corpusname)) {
684  lhtml.setCorpusName(cfProperties.getProperty(corpusname));
685  System.out.println("Corpus name set to " + cfProperties.getProperty(corpusname));
686  }
687  if (cfProperties.containsKey(fsm)) {
688  lhtml.setExternalFSM(cfProperties.getProperty(fsm));
689  System.out.println("External FSM path set to " + cfProperties.getProperty(fsm));
690  }
691  }
692  cf2strcorpusfunctions.add(lhtml);
693  break;
694  case "exbeventlinebreakschecker":
696  cf2strcorpusfunctions.add(elb);
697  break;
698  case "maketimelineconsistent":
700  if (cfProperties != null) {
701  // Pass on the configuration parameter
702  if (cfProperties.containsKey("interpolate")) {
703  emtc.setInterpolateTimeline(cfProperties.getProperty("interpolate"));
704  System.out.println("FixWhitespace set to " + cfProperties.getProperty("interpolate"));
705  }
706  }
707  cf2strcorpusfunctions.add(emtc);
708  break;
709  case "exbstructurechecker":
711  cf2strcorpusfunctions.add(esc);
712  break;
713  case "exbsegmentationchecker":
715  if (cfProperties != null) {
716  // Pass on the configuration parameter
717  if (cfProperties.containsKey(segmentation)) {
718  eseg.setSegmentation(cfProperties.getProperty(segmentation));
719  System.out.println("Segmentation set to " + cfProperties.getProperty(segmentation));
720  }
721  if (cfProperties.containsKey(fsm)) {
722  eseg.setExternalFSM(cfProperties.getProperty(fsm));
723  System.out.println("External FSM path set to " + cfProperties.getProperty(fsm));
724  }
725  }
726  cf2strcorpusfunctions.add(eseg);
727  break;
728  case "exbsegmenter":
730  if (cfProperties != null) {
731  // Pass on the configuration parameter
732  if (cfProperties.containsKey(segmentation)) {
733  esegr.setSegmentation(cfProperties.getProperty(segmentation));
734  System.out.println("Segmentation set to " + cfProperties.getProperty(segmentation));
735  }
736  if (cfProperties.containsKey(fsm)) {
737  esegr.setExternalFSM(cfProperties.getProperty(fsm));
738  System.out.println("External FSM path set to " + cfProperties.getProperty(fsm));
739  }
740  }
741  cf2strcorpusfunctions.add(esegr);
742  break;
743  case "calculateannotatedtime":
745  cf2strcorpusfunctions.add(cat);
746  break;
747  case "addcsvmetadatatocoma":
749  if (cfProperties != null) {
750  // Pass on the configuration parameter
751  if (cfProperties.containsKey("csv")) {
752  acmtc.setCSVFilePath(cfProperties.getProperty("csv"));
753  System.out.println("CSV file path set to " + cfProperties.getProperty("csv"));
754  }
755  if (cfProperties.containsKey("speaker")) {
756  acmtc.setSpeakerOrCommunication(cfProperties.getProperty("speaker"));
757  System.out.println("CSV file set for " + cfProperties.getProperty("speaker"));
758  }
759  }
760  cf2strcorpusfunctions.add(acmtc);
761  break;
762  case "removeemptyevents":
764  cf2strcorpusfunctions.add(ree);
765  break;
766  case "comatieroverviewcreator":
768  cf2strcorpusfunctions.add(ctoc);
769  break;
770  case "generaltransformer":
772  if (cfProperties != null) {
773  if (cfProperties.containsKey("coma")) {
774  gt.setComa(cfProperties.getProperty("coma"));
775  System.out.println("Run on Coma set to " + cfProperties.getProperty("coma"));
776  }
777  if (cfProperties.containsKey("exb")) {
778  gt.setExb(cfProperties.getProperty("exb"));
779  System.out.println("Run on exb set to " + cfProperties.getProperty("exb"));
780  }
781  if (cfProperties.containsKey("exs")) {
782  gt.setExs(cfProperties.getProperty("exs"));
783  System.out.println("Run on exs set to " + cfProperties.getProperty("exs"));
784  }
785  if (cfProperties.containsKey("xsl")) {
786  gt.setPathToXSL(cfProperties.getProperty("xsl"));
787  System.out.println("Path to XSL set to " + cfProperties.getProperty("xsl"));
788  }
789  if (cfProperties.containsKey("overwritefiles")) {
790  gt.setOverwriteFiles(cfProperties.getProperty("overwritefiles"));
791  System.out.println("overwritefiles set to " + cfProperties.getProperty("overwritefiles"));
792  }
793  }
794  cf2strcorpusfunctions.add(gt);
795  break;
796  case "exbmp3next2wavadder":
798  cf2strcorpusfunctions.add(emn2wa);
799  break;
800  case "exbreftierchecker":
802  cf2strcorpusfunctions.add(ertc);
803  break;
804  case "exbscriptmixchecker":
806  cf2strcorpusfunctions.add(esmc);
807  break;
808  case "duplicatetiercontentchecker":
810  cf2strcorpusfunctions.add(duplc);
811  if (cfProperties != null) {
812  // Pass on the configuration parameter
813  if (cfProperties.containsKey("tiers")) {
814  duplc.setTierNames(cfProperties.getProperty("tiers"));
815  System.out.println("Tier names set to " + cfProperties.getProperty("tiers"));
816  }
817  }
818  break;
819  case "languagetoolchecker":
821  if (cfProperties != null) {
822  // Pass on the configuration parameter
823  if (cfProperties.containsKey(spelllang)) {
824  ltc.setLanguage(cfProperties.getProperty(spelllang));
825  System.out.println("Language set to " + cfProperties.getProperty(spelllang));
826  }
827  if (cfProperties.containsKey("tier")) {
828  ltc.setTierToCheck(cfProperties.getProperty("tier"));
829  System.out.println("Tier to check set to " + cfProperties.getProperty("tier"));
830  }
831  }
832  cf2strcorpusfunctions.add(ltc);
833  break;
834  case "vikusviewer":
835  VikusViewer vv = new VikusViewer();
836  cf2strcorpusfunctions.add(vv);
837  break;
838  default:
839  report.addCritical("CommandlineFunctionality", "Function String \"" + function + "\" is not recognized");
840  }
841  }
842  return cf2strcorpusfunctions;
843  }
844 
845  //run the chosen functions on the chosen corpus data
846  Report runChosencorpusfunctions() {
847  //it's an unordered Collection of corpus data
848  if (isCollection) {
849  for (CorpusFunction function : corpusfunctions) {
850  if (fixing) {
851  report.merge(runCorpusFunction(cdc, function, true));
852  } else {
853  report.merge(runCorpusFunction(cdc, function));
854  }
855  }
856  //Congrats - It's a corpus!
857  } else if (isCorpus) {
858  for (CorpusFunction function : corpusfunctions) {
859  if (fixing) {
860  report.merge(runCorpusFunction(corpus, function, true));
861  } else {
862  report.merge(runCorpusFunction(corpus, function));
863  }
864  }
865  //must be a single file then
866  } else {
867  for (CorpusFunction function : corpusfunctions) {
868  if (fixing) {
869  report.merge(runCorpusFunction(corpusData, function, true));
870  } else {
871  report.merge(runCorpusFunction(corpusData, function));
872  }
873  }
874  }
875 
876  return report;
877  }
878  //run multiple functions on a corpus, that means all the files in the corpus
879  //the function can run on
880 
881  public Report runCorpusFunctions(Corpus c, Collection<CorpusFunction> cfc) {
882  Report report = new Report();
883  for (CorpusFunction cf : cfc) {
884  Report newReport = runCorpusFunction(c, cf);
885  report.merge(newReport);
886  }
887  return report;
888  }
889 
890  //run multiple functions on the set corpus, that means all the files in the corpus
891  //the function can run on
892  public Report runCorpusFunctions(Collection<CorpusFunction> cfc) {
893  return runCorpusFunctions(corpus, cfc);
894  }
895 
896  //run one function on a corpus, that means all the files in the corpus
897  //the function can run on
899  return runCorpusFunction(c, cf, false);
900  }
901 
902  //run one function on a corpus, that means all the files in the corpus
903  //the funciton can run on
904  public Report runCorpusFunction(Corpus c, CorpusFunction cf, boolean fix) {
905  Report report = new Report();
906  //find out on which objects this corpus function can run
907  //choose those from the corpus
908  //and run the checks on those files recursively
909  Collection<Class<? extends CorpusData>> usableTypes = cf.getIsUsableFor();
910 
911  //if the corpus files are an instance
912  //of the class cl, run the function
913  for (CorpusData cd : c.getCorpusData()) {
914  if (usableTypes.contains(cd.getClass())) {
915  Report newReport = runCorpusFunction(cd, cf, fix);
916  report.merge(newReport);
917  }
918 
919  }
920  return report;
921  }
922 
923  //run one function on a corpus, that means all the files in the corpus
924  //the function can run on
926  return runCorpusFunction(corpus, cf, false);
927  }
928 
929  //run one function on a corpus, that means all the files in the corpus
930  //the funciton can run on
931  public Report runCorpusFunction(Collection<CorpusData> cdc, CorpusFunction cf, boolean fix) {
932  Report report = new Report();
933  //find out on which objects this corpus function can run
934  //choose those from the corpus
935  //and run the checks on those files recursively
936  Collection<Class<? extends CorpusData>> usableTypes = cf.getIsUsableFor();
937  //if the corpus files are an instance
938  //of the class cl, run the function
939  for (CorpusData cd : cdc) {
940  if (usableTypes.contains(cd.getClass())) {
941  Report newReport = runCorpusFunction(cd, cf, fix);
942  report.merge(newReport);
943  }
944  }
945  return report;
946  }
947 
948  //run one function on a corpus, that means all the files in the corpus
949  //the funciton can run on
950  public Report runCorpusFunction(Collection<CorpusData> cdc, CorpusFunction cf) {
951  return runCorpusFunction(cdc, cf, false);
952  }
953 
955  return cf.execute(cd);
956  }
957 
958  public Report runCorpusFunction(CorpusData cd, CorpusFunction cf, boolean fix) {
959  return cf.execute(cd, fix);
960  }
961 
962  public static Report runCorpusFunctions(CorpusData cd, Collection<CorpusFunction> cfc) {
963  Report report = new Report();
964  for (CorpusFunction cf : cfc) {
965  Report newReport = (cf.execute(cd));
966  report.merge(newReport);
967  }
968  return report;
969  }
970 
971  public static Report runCorpusFunctions(CorpusData cd, Collection<CorpusFunction> cfc, boolean fix) {
972  Report report = new Report();
973  for (CorpusFunction cf : cfc) {
974  Report newReport = (cf.execute(cd, fix));
975  report.merge(newReport);
976  }
977  return report;
978  }
979 
980  //TODO
981  //to save individual corpusparameters in a file
982  //and maybe also save the functions todos there
983  public void readParameters() {
984  //read the XML file as variables
985  }
986 
987  public void setCorpusData(CorpusData corpusData) {
988  this.corpusData = corpusData;
989  }
990 
991  public void setChosencorpusfunctions(Collection<String> chosencorpusfunctions) {
992  CorpusMagician.chosencorpusfunctions = chosencorpusfunctions;
993  }
994 
995  public Corpus getCorpus() {
996  return corpus;
997  }
998 
1000  return corpusData;
1001  }
1002 
1003  public Collection<String> getChosencorpusfunctions() {
1004  return chosencorpusfunctions;
1005  }
1006 
1007  public static void createReports() throws IOException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, SAXException, XPathExpressionException, JDOMException {
1008  System.out.println(report.getFullReports());
1009  String reportOutput;
1010  if (reportlocation.getFile().endsWith("html")) {
1011  if (iserrorsonly) {
1012  //ToDo
1013  //reportOutput = ReportItem.generateDataTableHTML(report.getErrorStatistics(basedirectory), report.getSummaryLines());
1014  reportOutput = ReportItem.generateDataTableHTML(report.getErrorStatistics(), report.getSummaryLines());
1015  } else {
1016  reportOutput = ReportItem.generateDataTableHTML(report.getRawStatistics(), report.getSummaryLines());
1017  }
1018  } else {
1019  //reportOutput = report.getSummaryLines() + "\n" + report.getErrorReports();
1020  reportOutput = report.getSummaryLines() + "\n" + report.getFullReports();
1021  }
1022  String absoluteReport = reportOutput;
1023  if (absoluteReport != null && basedirectory != null && absoluteReport.contains(basedirectory.toString())) {
1024  absoluteReport = reportOutput.replaceAll(basedirectory.toString(), "");
1025  }
1026  if (absoluteReport != null) {
1027  cio.write(absoluteReport, reportlocation);
1028  }
1029  //create the error list file
1030  System.out.println("Basedirectory is " + basedirectory);
1031  System.out.println("BasedirectoryPath is " + basedirectory.getPath());
1032  URL errorlistlocation = new URL(basedirectory + "curation/CorpusServices_Errors.xml");
1033  URL fixJsonlocation = new URL(basedirectory + "curation/fixes.json");
1034  File curationFolder = new File((new URL(basedirectory + "curation").getFile()));
1035  if (!curationFolder.exists()) {
1036  //the curation folder is not there and needs to be created
1037  curationFolder.mkdirs();
1038  }
1040  String exmaErrorListString = TypeConverter.JdomDocument2String(exmaErrorList);
1041  if (exmaErrorListString != null && basedirectory != null && exmaErrorListString.contains(basedirectory.getPath())) {
1042  exmaErrorListString = exmaErrorListString.replaceAll(basedirectory.getPath(), "../");
1043  }
1044  if (exmaErrorListString != null) {
1045  exmaErrorListString = pp.indent(exmaErrorListString, "event");
1046  cio.write(exmaErrorListString, errorlistlocation);
1047  System.out.println("Wrote ErrorList at " + errorlistlocation);
1048  }
1049  if (isfixesjson) {
1050  String fixJson = "";
1051  if (isCorpus) {
1052  fixJson = report.getFixJson(corpus);
1053  } else {
1054  fixJson = report.getFixJson();
1055  }
1056  if (fixJson != null) {
1057  cio.write(fixJson, fixJsonlocation);
1058  System.out.println("Wrote JSON file for fixes at " + fixJsonlocation);
1059  }
1060  }
1061  }
1062 
1063  public static void readCommandLineOptions() throws MalformedURLException {
1064  String urlstring = cmd.getOptionValue("input");
1065  fixing = cmd.hasOption("f");
1066  iserrorsonly = cmd.hasOption("e");
1067  isfixesjson = cmd.hasOption("j");
1068  if (urlstring.startsWith("file://")) {
1069  inputurl = new URL(urlstring);
1070  } else {
1071  inputurl = Paths.get(urlstring).toUri().toURL();
1072  }
1073  //now the place where Report should end up
1074  //also allow normal filepaths and convert them
1075  String reportstring = cmd.getOptionValue("output");
1076  if (reportstring.startsWith("file://")) {
1077  reportlocation = new URL(reportstring);
1078  } else {
1079  reportlocation = Paths.get(reportstring).toUri().toURL();
1080  }
1081  //now add the functionsstrings to array
1082  String[] corpusfunctionarray = cmd.getOptionValues("c");
1083  for (String cf : corpusfunctionarray) {
1084  CorpusMagician.chosencorpusfunctions.add(cf);
1085  }
1086  System.out.println(CorpusMagician.chosencorpusfunctions.toString());
1087  }
1088 
1089  private static void createCommandLineOptions(String[] args) throws FileNotFoundException, IOException {
1090  Options options = new Options();
1091 
1092  Option input = new Option("i", "input", true, "input file path (coma file for corpus, folder or other file for unstructured data)");
1093  input.setRequired(true);
1094  input.setArgName("FILE PATH");
1095  options.addOption(input);
1096 
1097  Option output = new Option("o", "output", true, "output file");
1098  output.setRequired(true);
1099  output.setArgName("FILE PATH");
1100  options.addOption(output);
1101 
1102  Option corpusfunction = new Option("c", "corpusfunction", true, "corpus function");
1103  // Set option c to take 1 to oo arguments
1104  corpusfunction.setArgs(Option.UNLIMITED_VALUES);
1105  corpusfunction.setArgName("CORPUS FUNCTION");
1106  corpusfunction.setRequired(true);
1107  corpusfunction.setValueSeparator(',');
1108  options.addOption(corpusfunction);
1109 
1110  /*
1111  Option speed = new Option("s", "speed", false, "faster but more heap space");
1112  speed.setRequired(false);
1113  options.addOption(speed);
1114  */
1115  Option propertyOption = Option.builder("p")
1116  .longOpt("property")
1117  .argName("property=value")
1118  .hasArgs()
1119  .valueSeparator()
1120  .numberOfArgs(2)
1121  .desc("use value for given properties")
1122  .build();
1123 
1124  options.addOption(propertyOption);
1125 
1126  Option fix = new Option("f", "fix", false, "fixes problems automatically");
1127  fix.setRequired(false);
1128  options.addOption(fix);
1129 
1130  Option help = new Option("h", "help", false, "display help");
1131  fix.setRequired(false);
1132  options.addOption(help);
1133 
1134  Option errorsonly = new Option("e", "errorsonly", false, "output only errors");
1135  fix.setRequired(false);
1136  options.addOption(errorsonly);
1137 
1138  Option fixesjson = new Option("j", "fixesjson", false, "output json file for fixes");
1139  fix.setRequired(false);
1140  options.addOption(fixesjson);
1141 
1142  Option settingsfile = new Option("s", "settingsfile", true, "settings file path");
1143  settingsfile.setRequired(false);
1144  settingsfile.setArgName("FILE PATH");
1145  options.addOption(settingsfile);
1146 
1147  CommandLineParser parser = new DefaultParser();
1148  HelpFormatter formatter = new HelpFormatter();
1149  formatter.setOptionComparator(null);
1150 
1151  String header = "Specify a corpus folder or file and a function to be applied\n\n";
1152  //String footer = "\nthe available functions are:\n" + getAllExistingCFsAsString() + "\n\nPlease report issues at https://lab.multilingua.uni-hamburg.de/redmine/projects/corpus-services/issues";
1153  String footerverbose = "\nthe available functions are:\n" + getAllExistingCFsAsString() + "\n\nDescriptions of the available functions follow:\n\n";
1154  String desc;
1155  String hasfix;
1156  String usable;
1157  for (CorpusFunction cf : getAllExistingCFsAsCFs()) {
1158  desc = cf.getFunction() + ": " + cf.getDescription();
1159  usable = "\nThe function can be used on:\n";
1160  for (Class cl : cf.getIsUsableFor()) {
1161  usable += cl.getSimpleName() + " ";
1162  }
1163  hasfix = "\nThe function has a fixing option: " + cf.getCanFix().toString();
1164  footerverbose += desc + hasfix + usable + "\n\n";
1165  usable = "";
1166  }
1167  footerverbose += "\n\nPlease report issues at https://lab.multilingua.uni-hamburg.de/redmine/projects/corpus-services/issues";
1168  try {
1169  cmd = parser.parse(options, args);
1170  } catch (ParseException e) {
1171  System.out.println(e.getMessage());
1172  formatter.printHelp("hzsk-corpus-services", header, options, footerverbose, true);
1173  System.exit(1);
1174  }
1175 
1176  //TODO
1177  //in reality this never works because there will be an error since the required parameters are missing - but that returns the help as well....
1178  if (cmd.hasOption("h")) {
1179  // automatically generate the help statement
1180  formatter.printHelp("hzsk-corpus-services", header, options, footerverbose, true);
1181  System.exit(1);
1182  }
1183 
1184  if (cmd.hasOption("p")) {
1185  if (cmd.hasOption("s")) {
1186  System.out.println("Options s and p for parameters are not allowed at the same time!!");
1187  formatter.printHelp("hzsk-corpus-services", header, options, footerverbose, true);
1188  System.exit(1);
1189  } else {
1190  cfProperties = cmd.getOptionProperties("p");
1191  }
1192  } else {
1193  if (cmd.hasOption("s")) {
1194  //read filepath
1195  settingsfilepath = cmd.getOptionValue("s");
1196  } else {
1197  //default
1198  settingsfilepath = "settings.param";
1199  }
1200  //also need to allow for not findind the xml settings file here!
1201  if (new File(settingsfilepath).exists()) {
1202  FileInputStream test = new FileInputStream(settingsfilepath);
1203  cfProperties.loadFromXML(test);
1204  System.out.println("Properties are: " + cfProperties);
1205  } else {
1206  System.out.println("No parameters loaded.");
1207  }
1208  }
1209 
1210  //we can save the properties if the input was not from an settings.xml
1211  //cfProperties.storeToXML()
1212  //add function to read properties from file! Needs to be a key value list though not xml
1213  //Reads a property list (key and element pairs) from the input
1214  //Need to use
1215 // * byte stream. The input stream is in a simple line-oriented
1216 // * format as specified in
1217 // * {@link #load(java.io.Reader) load(Reader)} and is assumed to use
1218 // * the ISO 8859-1 character encoding; that is each byte is one Latin1
1219 // * character. Characters not in Latin1, and certain special characters,
1220 // * are represented in keys and elements using Unicode escapes as defined in
1221 // * section 3.3 of
1222 
1223  /*
1224  String inputFilePath = cmd.getOptionValue("input");
1225  String outputFilePath = cmd.getOptionValue("output");
1226 
1227  System.out.println(inputFilePath);
1228  System.out.println(outputFilePath);
1229  */
1230  }
1231 
1232 }
Report runCorpusFunction(CorpusData cd, CorpusFunction cf)
Collection< CorpusFunction > getDefaultUsableFunctions()
Report runCorpusFunction(CorpusFunction cf)
static Collection< String > getAllExistingCFs()
void merge(Report sr)
Definition: Report.java:73
Report runCorpusFunction(Corpus c, CorpusFunction cf)
Collection< CorpusData > getCorpusData()
Definition: Corpus.java:119
static String generateDataTableHTML(Collection< ReportItem > errors, String summarylines)
static Collection< CorpusFunction > getAllExistingCFsAsCFs()
String indent(String xml, String suppressedElements)
static org.jdom.Document W3cDocument2JdomDocument(org.w3c.dom.Document input)
static Report runCorpusFunctions(CorpusData cd, Collection< CorpusFunction > cfc)
Report runCorpusFunction(Corpus c, CorpusFunction cf, boolean fix)
Collection< Class<?extends CorpusData > > getIsUsableFor()
void registerCorpusFunction(CorpusFunction cf)
static Report runCorpusFunctions(CorpusData cd, Collection< CorpusFunction > cfc, boolean fix)
Report runCorpusFunction(CorpusData cd, CorpusFunction cf, boolean fix)
void setCorpusData(CorpusData corpusData)
static Collection< CorpusFunction > corpusFunctionStrings2Classes(Collection< String > corpusfunctionstrings)
void setChosencorpusfunctions(Collection< String > chosencorpusfunctions)
Collection< String > getChosencorpusfunctions()
Report runCorpusFunction(Collection< CorpusData > cdc, CorpusFunction cf)
static String JdomDocument2String(org.jdom.Document jdomDocument)
Collection< CorpusFunction > getUsableFunctions(CorpusData cd)
Report runCorpusFunctions(Collection< CorpusFunction > cfc)
void initDataWithURL(URL url, Collection< Class<?extends CorpusData >> clcds)
Collection< URL > createListofData(URL url)
Report runCorpusFunctions(Corpus c, Collection< CorpusFunction > cfc)
Report runCorpusFunction(Collection< CorpusData > cdc, CorpusFunction cf, boolean fix)
Collection< String > chooseFunctionDialog()