1 package de.uni_hamburg.corpora;
58 import java.io.FileInputStream;
59 import java.io.FileNotFoundException;
60 import java.io.IOException;
61 import java.io.UnsupportedEncodingException;
62 import java.net.MalformedURLException;
63 import java.net.URISyntaxException;
65 import java.util.ArrayList;
66 import java.util.Collection;
67 import java.nio.file.Paths;
68 import java.util.Collections;
69 import org.apache.commons.cli.CommandLine;
70 import org.apache.commons.cli.CommandLineParser;
71 import org.apache.commons.cli.DefaultParser;
72 import org.apache.commons.cli.HelpFormatter;
73 import org.apache.commons.cli.Option;
74 import org.apache.commons.cli.Options;
75 import org.apache.commons.cli.ParseException;
76 import java.util.Iterator;
77 import java.util.List;
78 import java.util.Properties;
79 import javax.xml.parsers.ParserConfigurationException;
80 import javax.xml.transform.TransformerException;
81 import javax.xml.xpath.XPathExpressionException;
82 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
83 import org.jdom.Document;
84 import org.jdom.JDOMException;
85 import org.xml.sax.SAXException;
// Corpus data read from the input; assigned from cio.read(url, clcds) in initDataWithURL.
98 Collection<CorpusData> cdc;
// Used by createReports: its string form is stripped from the report text and the
// "curation/..." output locations are built by appending to it.
102 static URL basedirectory;
// Names of the available corpus functions; filled with the hard-coded names and sorted
// further below, and extended with cf.getClass().getName() where a function is registered.
104 static Collection<String> allExistingCFs;
// Function names selected for this run; printed after the command line has been read.
// NOTE(review): presumably filled from the -c option values - the add call is not visible here.
106 static Collection<String> chosencorpusfunctions =
new ArrayList<String>();
// CorpusFunction objects for this run. NOTE(review): where this is populated is not visible here.
107 static Collection<CorpusFunction> corpusfunctions =
new ArrayList<CorpusFunction>();
// CorpusData classes required by the functions: main adds every class returned by
// cf.getIsUsableFor() that is not already contained.
109 static Collection<Class<? extends CorpusData>> neededcorpusdatatypes =
new ArrayList<Class<? extends CorpusData>>();
// NOTE(review): no use of this list is visible in this excerpt - confirm it is still needed.
113 static ArrayList<URL> alldata =
new ArrayList<URL>();
// Set from the presence of the command line flag "f".
115 static boolean fixing =
false;
// Set from the presence of the command line flag "e".
116 static boolean iserrorsonly =
false;
// Set from the presence of the command line flag "j".
117 static boolean isfixesjson =
false;
// Parsed command line; assigned by parser.parse(options, args) in createCommandLineOptions.
118 static CommandLine cmd = null;
// Parameters for the corpus functions: either the "p" option properties or the content
// of the settings file loaded with loadFromXML.
121 static Properties cfProperties =
new Properties();
// Path of the settings file; createCommandLineOptions may overwrite this default with the
// value of option "s" or with "settings.param".
123 static String settingsfilepath =
"settings.xml";
// The following strings are keys that are looked up in cfProperties when the individual
// corpus functions are configured.
125 static String fsm =
"fsm";
126 static String segmentation =
"segmentation";
127 static String lang =
"lang";
128 static String spelllang =
"spelllang";
129 static String corpusname =
"corpusname";
130 static String kml =
"kml";
131 static String mode =
"mode";
// Target of the report written by createReports; derived from the "output" option value.
132 static URL reportlocation;
// Checked in runChosencorpusfunctions to select the corpus branch.
134 static boolean isCorpus =
false;
// NOTE(review): no read of this flag is visible in this excerpt.
135 static boolean isCollection =
false;
142 public static void main(String[] args) {
146 System.out.println(
"CorpusMagician is now doing its magic.");
150 createCommandLineOptions(args);
157 for (Class<? extends CorpusData> cecd : cf.getIsUsableFor()) {
158 if (!neededcorpusdatatypes.contains(cecd)) {
159 neededcorpusdatatypes.add(cecd);
173 report = corpuma.runChosencorpusfunctions();
175 }
catch (MalformedURLException ex) {
176 report.addException(ex,
"The given URL was incorrect");
177 }
catch (IOException ex) {
178 report.addException(ex,
"A file could not be read");
179 }
catch (ParserConfigurationException ex) {
180 report.addException(ex,
"A file could not be parsed");
181 }
catch (TransformerException ex) {
182 report.addException(ex,
"A transformation error occured");
183 }
catch (SAXException ex) {
184 report.addException(ex,
"An XSLT error occured");
185 }
catch (JexmaraldaException ex) {
186 report.addException(ex,
"An Exmaralda file reading error occured");
187 }
catch (URISyntaxException ex) {
188 report.addException(ex,
"A URI was incorrect");
189 }
catch (XPathExpressionException ex) {
190 report.addException(ex,
"An Xpath expression was incorrect");
191 }
catch (ClassNotFoundException ex) {
192 report.addException(ex,
"Class not found");
193 }
catch (JDOMException ex) {
194 report.addException(ex,
"JDOM error");
214 allExistingCFs.add(cf.getClass().getName());
219 public void initDataWithURL(URL url, Collection<Class<? extends CorpusData>> clcds) throws MalformedURLException, SAXException, JexmaraldaException, URISyntaxException, IOException, ClassNotFoundException, JDOMException {
220 if (cio.isDirectory(url)) {
223 cdc = cio.read(url, clcds);
234 System.out.println(
"It's a corpus");
237 corpus =
new Corpus((ComaData) cdata, clcds);
249 return cio.URLtoList(url);
257 allExistingCFs =
new ArrayList<String>();
258 allExistingCFs.add(
"ComaApostropheChecker");
259 allExistingCFs.add(
"ComaNSLinksChecker");
260 allExistingCFs.add(
"ComaOverviewGeneration");
261 allExistingCFs.add(
"ComaChartsGeneration");
262 allExistingCFs.add(
"ZipCorpus");
263 allExistingCFs.add(
"HandlePidRegistration");
264 allExistingCFs.add(
"RemoveUnlinkedFiles");
265 allExistingCFs.add(
"ComaSegmentCountChecker");
266 allExistingCFs.add(
"ExbFileReferenceChecker");
267 allExistingCFs.add(
"ExbFileCoverageChecker");
268 allExistingCFs.add(
"ExbAnnotationPanelCheck");
269 allExistingCFs.add(
"EXB2INELISOTEI");
270 allExistingCFs.add(
"EXB2HIATISOTEI");
271 allExistingCFs.add(
"ExbStructureChecker");
272 allExistingCFs.add(
"ComaFileCoverageChecker");
273 allExistingCFs.add(
"NormalizeEXB");
274 allExistingCFs.add(
"PrettyPrintData");
275 allExistingCFs.add(
"RemoveAbsolutePaths");
276 allExistingCFs.add(
"RemoveAutoSaveExb");
277 allExistingCFs.add(
"XSLTChecker");
278 allExistingCFs.add(
"ComaAddTiersFromExbsCorrector");
279 allExistingCFs.add(
"ComaXsdChecker");
280 allExistingCFs.add(
"NgexmaraldaCorpusChecker");
281 allExistingCFs.add(
"FilenameChecker");
282 allExistingCFs.add(
"CmdiChecker");
283 allExistingCFs.add(
"ComaTiersDescriptionAnnotationPanelChecker");
284 allExistingCFs.add(
"ExbTierDisplayNameChecker");
285 allExistingCFs.add(
"NgTierCheckerWithAnnotation");
287 allExistingCFs.add(
"GenerateAnnotationPanel");
288 allExistingCFs.add(
"CorpusDataRegexReplacer");
289 allExistingCFs.add(
"ScoreHTML");
290 allExistingCFs.add(
"HScoreHTML");
291 allExistingCFs.add(
"CorpusHTML");
292 allExistingCFs.add(
"IAAFunctionality");
293 allExistingCFs.add(
"ListHTML");
294 allExistingCFs.add(
"ExbEventLinebreaksChecker");
295 allExistingCFs.add(
"MakeTimelineConsistent");
296 allExistingCFs.add(
"ExbSegmentationChecker");
297 allExistingCFs.add(
"CalculateAnnotatedTime");
298 allExistingCFs.add(
"AddCSVMetadataToComa");
299 allExistingCFs.add(
"ComaKmlForLocations");
300 allExistingCFs.add(
"RemoveEmptyEvents");
301 allExistingCFs.add(
"ComaTranscriptionsNameChecker");
302 allExistingCFs.add(
"ComaTierOverviewCreator");
303 allExistingCFs.add(
"GeneralTransformer");
304 allExistingCFs.add(
"ComaFedoraIdentifierLengthChecker");
305 allExistingCFs.add(
"ExbMP3Next2WavAdder");
306 allExistingCFs.add(
"ExbRefTierChecker");
307 allExistingCFs.add(
"ReportStatistics");
308 allExistingCFs.add(
"ExbSegmenter");
309 allExistingCFs.add(
"ExbScriptMixChecker");
310 allExistingCFs.add(
"DuplicateTierContentChecker");
311 allExistingCFs.add(
"LanguageToolChecker");
312 allExistingCFs.add(
"VikusViewer");
313 Collections.sort((List<String>) allExistingCFs);
314 return allExistingCFs;
320 String s = it.next();
321 all = all +
"\n" + s;
336 Collection<CorpusFunction> usablecorpusfunctions = null;
337 return usablecorpusfunctions;
343 Collection<CorpusFunction> defaultcorpusfunctions = null;
344 return defaultcorpusfunctions;
349 chosencorpusfunctions = null;
351 return chosencorpusfunctions;
355 Collection<CorpusFunction> cf2strcorpusfunctions =
new ArrayList<CorpusFunction>();
356 for (String
function : corpusfunctionstrings) {
357 switch (
function.toLowerCase()) {
358 case "comaapostrophechecker":
360 cf2strcorpusfunctions.add(cac);
362 case "comanslinkschecker":
364 cf2strcorpusfunctions.add(cnslc);
366 case "comaoverviewgeneration":
368 if (cfProperties != null) {
370 if (cfProperties.containsKey(mode) && cfProperties.getProperty(mode).toLowerCase().equals(
"inel")) {
372 System.out.println(
"Mode set to inel");
375 cf2strcorpusfunctions.add(cog);
377 case "comachartsgeneration":
380 if (cfProperties.containsKey(mode) && cfProperties.getProperty(mode).toLowerCase().equals(
"inel")) {
382 System.out.println(
"Mode set to inel");
385 cf2strcorpusfunctions.add(coc);
387 case "comasegmentcountchecker":
389 cf2strcorpusfunctions.add(cscc);
391 case "exbfilereferencechecker":
393 cf2strcorpusfunctions.add(efrc);
395 case "exbfilecoveragechecker":
397 cf2strcorpusfunctions.add(efcc);
399 case "exbannotationpanelcheck":
401 cf2strcorpusfunctions.add(eapc);
403 case "comafilecoveragechecker":
405 if (cfProperties != null) {
407 if (cfProperties.containsKey(mode) && cfProperties.getProperty(mode).equals(
"inel")) {
412 System.out.println(
"Mode set to inel");
415 cf2strcorpusfunctions.add(fcc);
417 case "prettyprintdata":
419 cf2strcorpusfunctions.add(pd);
421 case "removeabsolutepaths":
423 cf2strcorpusfunctions.add(rap);
425 case "removeautosaveexb":
427 cf2strcorpusfunctions.add(rase);
431 if (cfProperties != null) {
433 if (cfProperties.containsKey(mode) && cfProperties.getProperty(mode).toLowerCase().equals(
"inel")) {
435 System.out.println(
"Mode set to inel");
437 if (cfProperties.containsKey(fsm)) {
439 System.out.println(
"FSM set to " + cfProperties.getProperty(fsm));
442 cf2strcorpusfunctions.add(xc);
444 case "comaaddtiersfromexbscorrector":
446 cf2strcorpusfunctions.add(catfec);
448 case "comaxsdchecker":
450 cf2strcorpusfunctions.add(cxsd);
452 case "ngexmaraldacorpuschecker":
454 cf2strcorpusfunctions.add(ngex);
456 case "filenamechecker":
458 cf2strcorpusfunctions.add(fnc);
462 cf2strcorpusfunctions.add(cmdi);
464 case "comafedoraidentifierlengthchecker":
466 cf2strcorpusfunctions.add(cplc);
468 case "comatranscriptionsnamechecker":
470 cf2strcorpusfunctions.add(cnc);
472 case "comatiersdescriptionannotationpanelchecker":
474 cf2strcorpusfunctions.add(tcwa);
476 case "exbtierdisplaynamechecker":
478 cf2strcorpusfunctions.add(tc);
480 case "ngtiercheckerwithannotation":
482 cf2strcorpusfunctions.add(ngtcwa);
484 case "exb2inelisotei":
487 if (cfProperties != null) {
489 if (cfProperties.containsKey(lang)) {
491 System.out.println(
"Language set to " + cfProperties.getProperty(lang));
493 if (cfProperties.containsKey(fsm)) {
494 eiit.
setFSM(cfProperties.getProperty(fsm));
495 System.out.println(
"FSM set to " + cfProperties.getProperty(fsm));
498 cf2strcorpusfunctions.add(eiit);
501 case "exb2inelisoteisel":
504 if (cfProperties.containsKey(fsm)) {
505 eiitsel.
setFSM(cfProperties.getProperty(fsm));
506 System.out.println(
"FSM set to " + cfProperties.getProperty(fsm));
509 cf2strcorpusfunctions.add(eiitsel);
511 case "exb2inelisoteidlg":
514 if (cfProperties.containsKey(fsm)) {
515 eiitdlg.
setFSM(cfProperties.getProperty(fsm));
516 System.out.println(
"FSM set to " + cfProperties.getProperty(fsm));
519 cf2strcorpusfunctions.add(eiitdlg);
521 case "exb2inelisoteixas":
524 if (cfProperties.containsKey(fsm)) {
525 eiitxas.
setFSM(cfProperties.getProperty(fsm));
526 System.out.println(
"FSM set to " + cfProperties.getProperty(fsm));
529 cf2strcorpusfunctions.add(eiitxas);
531 case "exb2hiatisotei":
533 if (cfProperties != null) {
535 if (cfProperties.containsKey(lang)) {
537 System.out.println(
"Language set to " + cfProperties.getProperty(lang));
539 if (cfProperties.containsKey(mode)) {
540 if (cfProperties.getProperty(mode).toLowerCase().equals(
"inel")) {
542 System.out.println(
"Mode set to inel");
543 }
else if (cfProperties.getProperty(mode).toLowerCase().equals(
"token")) {
545 System.out.println(
"Mode set to token");
548 if (cfProperties.containsKey(fsm)) {
549 ehit.
setFSM(cfProperties.getProperty(fsm));
550 System.out.println(
"FSM set to " + cfProperties.getProperty(fsm));
553 cf2strcorpusfunctions.add(ehit);
557 if (cfProperties != null) {
559 if (cfProperties.containsKey(
"whitespace")) {
561 System.out.println(
"FixWhitespace set to " + cfProperties.getProperty(
"whitespace"));
564 cf2strcorpusfunctions.add(ne);
566 case "generateannotationpanel":
568 cf2strcorpusfunctions.add(gap);
570 case "iaafunctionality":
572 cf2strcorpusfunctions.add(iaa);
574 case "comakmlforlocations":
576 if (cfProperties != null) {
578 if (cfProperties.containsKey(kml)) {
580 System.out.println(
"KML file path set to " + cfProperties.getProperty(kml));
583 cf2strcorpusfunctions.add(ckml);
585 case "reportstatistics":
587 cf2strcorpusfunctions.add(rs);
589 case "corpusdataregexreplacer":
593 if (cfProperties != null) {
595 if (cfProperties.containsKey(
"replace")) {
596 cdrr.
setReplace(cfProperties.getProperty(
"replace"));
597 System.out.println(
"Replace set to " + cfProperties.getProperty(
"replace"));
599 if (cfProperties.containsKey(
"replacement")) {
601 System.out.println(
"Replacement set to " + cfProperties.getProperty(
"replacement"));
603 if (cfProperties.containsKey(
"xpathcontext")) {
605 System.out.println(
"Xpath set to " + cfProperties.getProperty(
"xpathcontext"));
607 if (cfProperties.containsKey(
"coma")) {
608 cdrr.
setComa(cfProperties.getProperty(
"coma"));
609 System.out.println(
"Replace in Coma set to " + cfProperties.getProperty(
"coma"));
612 cf2strcorpusfunctions.add(cdrr);
616 if (cfProperties != null) {
618 if (cfProperties.containsKey(
"source_folder")) {
620 System.out.println(
"Location of source folder set to " + cfProperties.getProperty(
"source_folder"));
622 if (cfProperties.containsKey(
"output_zip_file")) {
623 zc.
setOutputFile(cfProperties.getProperty(
"output_zip_file"));
624 System.out.println(
"Location of output file set to " + cfProperties.getProperty(
"output_zip_file"));
626 if (cfProperties.containsKey(
"audio")) {
628 System.out.println(
"Should contain audio set to " + cfProperties.getProperty(
"audio"));
631 cf2strcorpusfunctions.add(zc);
633 case "handlepidregistration":
635 if (cfProperties != null) {
637 if (cfProperties.containsKey(
"user")) {
638 hppr.
setUser(cfProperties.getProperty(
"user"));
639 System.out.println(
"User set to " + cfProperties.getProperty(
"user"));
641 if (cfProperties.containsKey(
"pass")) {
642 hppr.
setPass(cfProperties.getProperty(
"pass"));
643 System.out.println(
"Password set to " + cfProperties.getProperty(
"pass").replaceAll(
".",
"*"));
646 if (cfProperties.containsKey(
"prefix")) {
648 System.out.println(
"Prefix set to " + cfProperties.getProperty(
"prefix"));
651 cf2strcorpusfunctions.add(hppr);
653 case "removeunlinkedfiles":
655 cf2strcorpusfunctions.add(ruf);
659 if (cfProperties != null) {
660 if (cfProperties.containsKey(corpusname)) {
662 System.out.println(
"Corpus name set to " + cfProperties.getProperty(corpusname));
665 cf2strcorpusfunctions.add(shtml);
669 cf2strcorpusfunctions.add(hshtml);
673 cf2strcorpusfunctions.add(chtml);
677 if (cfProperties != null) {
679 if (cfProperties.containsKey(segmentation)) {
681 System.out.println(
"Segmentation set to " + cfProperties.getProperty(segmentation));
683 if (cfProperties.containsKey(corpusname)) {
685 System.out.println(
"Corpus name set to " + cfProperties.getProperty(corpusname));
687 if (cfProperties.containsKey(fsm)) {
689 System.out.println(
"External FSM path set to " + cfProperties.getProperty(fsm));
692 cf2strcorpusfunctions.add(lhtml);
694 case "exbeventlinebreakschecker":
696 cf2strcorpusfunctions.add(elb);
698 case "maketimelineconsistent":
700 if (cfProperties != null) {
702 if (cfProperties.containsKey(
"interpolate")) {
// Report the value under a label that matches its key. The previous text said
// "FixWhitespace", which is the label used for the "whitespace" property elsewhere in this file.
System.out.println("Interpolate timeline set to " + cfProperties.getProperty("interpolate"));
707 cf2strcorpusfunctions.add(emtc);
709 case "exbstructurechecker":
711 cf2strcorpusfunctions.add(esc);
713 case "exbsegmentationchecker":
715 if (cfProperties != null) {
717 if (cfProperties.containsKey(segmentation)) {
719 System.out.println(
"Segmentation set to " + cfProperties.getProperty(segmentation));
721 if (cfProperties.containsKey(fsm)) {
723 System.out.println(
"External FSM path set to " + cfProperties.getProperty(fsm));
726 cf2strcorpusfunctions.add(eseg);
730 if (cfProperties != null) {
732 if (cfProperties.containsKey(segmentation)) {
734 System.out.println(
"Segmentation set to " + cfProperties.getProperty(segmentation));
736 if (cfProperties.containsKey(fsm)) {
738 System.out.println(
"External FSM path set to " + cfProperties.getProperty(fsm));
741 cf2strcorpusfunctions.add(esegr);
743 case "calculateannotatedtime":
745 cf2strcorpusfunctions.add(cat);
747 case "addcsvmetadatatocoma":
749 if (cfProperties != null) {
751 if (cfProperties.containsKey(
"csv")) {
753 System.out.println(
"CSV file path set to " + cfProperties.getProperty(
"csv"));
755 if (cfProperties.containsKey(
"speaker")) {
757 System.out.println(
"CSV file set for " + cfProperties.getProperty(
"speaker"));
760 cf2strcorpusfunctions.add(acmtc);
762 case "removeemptyevents":
764 cf2strcorpusfunctions.add(ree);
766 case "comatieroverviewcreator":
768 cf2strcorpusfunctions.add(ctoc);
770 case "generaltransformer":
772 if (cfProperties != null) {
773 if (cfProperties.containsKey(
"coma")) {
774 gt.
setComa(cfProperties.getProperty(
"coma"));
775 System.out.println(
"Run on Coma set to " + cfProperties.getProperty(
"coma"));
777 if (cfProperties.containsKey(
"exb")) {
778 gt.
setExb(cfProperties.getProperty(
"exb"));
779 System.out.println(
"Run on exb set to " + cfProperties.getProperty(
"exb"));
781 if (cfProperties.containsKey(
"exs")) {
782 gt.
setExs(cfProperties.getProperty(
"exs"));
783 System.out.println(
"Run on exs set to " + cfProperties.getProperty(
"exs"));
785 if (cfProperties.containsKey(
"xsl")) {
787 System.out.println(
"Path to XSL set to " + cfProperties.getProperty(
"xsl"));
789 if (cfProperties.containsKey(
"overwritefiles")) {
791 System.out.println(
"overwritefiles set to " + cfProperties.getProperty(
"overwritefiles"));
794 cf2strcorpusfunctions.add(gt);
796 case "exbmp3next2wavadder":
798 cf2strcorpusfunctions.add(emn2wa);
800 case "exbreftierchecker":
802 cf2strcorpusfunctions.add(ertc);
804 case "exbscriptmixchecker":
806 cf2strcorpusfunctions.add(esmc);
808 case "duplicatetiercontentchecker":
810 cf2strcorpusfunctions.add(duplc);
811 if (cfProperties != null) {
813 if (cfProperties.containsKey(
"tiers")) {
815 System.out.println(
"Tier names set to " + cfProperties.getProperty(
"tiers"));
819 case "languagetoolchecker":
821 if (cfProperties != null) {
823 if (cfProperties.containsKey(spelllang)) {
824 ltc.
setLanguage(cfProperties.getProperty(spelllang));
825 System.out.println(
"Language set to " + cfProperties.getProperty(spelllang));
827 if (cfProperties.containsKey(
"tier")) {
829 System.out.println(
"Tier to check set to " + cfProperties.getProperty(
"tier"));
832 cf2strcorpusfunctions.add(ltc);
836 cf2strcorpusfunctions.add(vv);
839 report.addCritical(
"CommandlineFunctionality",
"Function String \"" +
function +
"\" is not recognized");
842 return cf2strcorpusfunctions;
846 Report runChosencorpusfunctions() {
857 }
else if (isCorpus) {
885 report.
merge(newReport);
909 Collection<Class<? extends CorpusData>> usableTypes = cf.
getIsUsableFor();
914 if (usableTypes.contains(cd.getClass())) {
916 report.
merge(newReport);
936 Collection<Class<? extends CorpusData>> usableTypes = cf.
getIsUsableFor();
940 if (usableTypes.contains(cd.getClass())) {
942 report.
merge(newReport);
965 Report newReport = (cf.execute(cd));
966 report.
merge(newReport);
974 Report newReport = (cf.execute(cd, fix));
975 report.
merge(newReport);
988 this.corpusData = corpusData;
1004 return chosencorpusfunctions;
1007 public static void createReports() throws IOException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, SAXException, XPathExpressionException, JDOMException {
1008 System.out.println(report.getFullReports());
1009 String reportOutput;
1010 if (reportlocation.getFile().endsWith(
"html")) {
1020 reportOutput = report.getSummaryLines() +
"\n" + report.getFullReports();
1022 String absoluteReport = reportOutput;
1023 if (absoluteReport != null && basedirectory != null && absoluteReport.contains(basedirectory.toString())) {
// The base directory is a literal prefix, not a pattern. String.replace removes every
// occurrence verbatim, consistent with the literal contains() test guarding this statement;
// replaceAll would interpret characters such as '.', '+' or '(' in the URL as regex syntax.
absoluteReport = reportOutput.replace(basedirectory.toString(), "");
1026 if (absoluteReport != null) {
1027 cio.write(absoluteReport, reportlocation);
1030 System.out.println(
"Basedirectory is " + basedirectory);
1031 System.out.println(
"BasedirectoryPath is " + basedirectory.getPath());
1032 URL errorlistlocation =
new URL(basedirectory +
"curation/CorpusServices_Errors.xml");
1033 URL fixJsonlocation =
new URL(basedirectory +
"curation/fixes.json");
1034 File curationFolder =
new File((
new URL(basedirectory +
"curation").getFile()));
1035 if (!curationFolder.exists()) {
1037 curationFolder.mkdirs();
1041 if (exmaErrorListString != null && basedirectory != null && exmaErrorListString.contains(basedirectory.getPath())) {
// Swap the literal base path for "../". String.replace is used because the path must not be
// interpreted as a regular expression (the guarding contains() test is literal as well).
exmaErrorListString = exmaErrorListString.replace(basedirectory.getPath(), "../");
1044 if (exmaErrorListString != null) {
1045 exmaErrorListString = pp.
indent(exmaErrorListString,
"event");
1046 cio.write(exmaErrorListString, errorlistlocation);
1047 System.out.println(
"Wrote ErrorList at " + errorlistlocation);
1050 String fixJson =
"";
1052 fixJson = report.getFixJson(corpus);
1054 fixJson = report.getFixJson();
1056 if (fixJson != null) {
1057 cio.write(fixJson, fixJsonlocation);
1058 System.out.println(
"Wrote JSON file for fixes at " + fixJsonlocation);
1064 String urlstring = cmd.getOptionValue(
"input");
1065 fixing = cmd.hasOption(
"f");
1066 iserrorsonly = cmd.hasOption(
"e");
1067 isfixesjson = cmd.hasOption(
"j");
1068 if (urlstring.startsWith(
"file://")) {
1069 inputurl =
new URL(urlstring);
1071 inputurl = Paths.get(urlstring).toUri().toURL();
1075 String reportstring = cmd.getOptionValue(
"output");
1076 if (reportstring.startsWith(
"file://")) {
1077 reportlocation =
new URL(reportstring);
1079 reportlocation = Paths.get(reportstring).toUri().toURL();
1082 String[] corpusfunctionarray = cmd.getOptionValues(
"c");
1083 for (String cf : corpusfunctionarray) {
1086 System.out.println(
CorpusMagician.chosencorpusfunctions.toString());
1089 private static void createCommandLineOptions(String[] args)
throws FileNotFoundException, IOException {
1090 Options options =
new Options();
1092 Option input =
new Option(
"i",
"input",
true,
"input file path (coma file for corpus, folder or other file for unstructured data)");
1093 input.setRequired(
true);
1094 input.setArgName(
"FILE PATH");
1095 options.addOption(input);
1097 Option output =
new Option(
"o",
"output",
true,
"output file");
1098 output.setRequired(
true);
1099 output.setArgName(
"FILE PATH");
1100 options.addOption(output);
1102 Option corpusfunction =
new Option(
"c",
"corpusfunction",
true,
"corpus function");
1104 corpusfunction.setArgs(Option.UNLIMITED_VALUES);
1105 corpusfunction.setArgName(
"CORPUS FUNCTION");
1106 corpusfunction.setRequired(
true);
1107 corpusfunction.setValueSeparator(
',');
1108 options.addOption(corpusfunction);
1115 Option propertyOption = Option.builder(
"p")
1116 .longOpt(
"property")
1117 .argName(
"property=value")
1121 .desc(
"use value for given properties")
1124 options.addOption(propertyOption);
1126 Option fix =
new Option(
"f",
"fix",
false,
"fixes problems automatically");
1127 fix.setRequired(
false);
1128 options.addOption(fix);
1130 Option help =
new Option(
"h",
"help",
false,
"display help");
1131 fix.setRequired(
false);
1132 options.addOption(help);
1134 Option errorsonly =
new Option(
"e",
"errorsonly",
false,
"output only errors");
1135 fix.setRequired(
false);
1136 options.addOption(errorsonly);
1138 Option fixesjson =
new Option(
"j",
"fixesjson",
false,
"output json file for fixes");
1139 fix.setRequired(
false);
1140 options.addOption(fixesjson);
1142 Option settingsfile =
new Option(
"s",
"settingsfile",
true,
"settings file path");
1143 settingsfile.setRequired(
false);
1144 settingsfile.setArgName(
"FILE PATH");
1145 options.addOption(settingsfile);
1147 CommandLineParser parser =
new DefaultParser();
1148 HelpFormatter formatter =
new HelpFormatter();
1149 formatter.setOptionComparator(null);
1151 String header =
"Specify a corpus folder or file and a function to be applied\n\n";
1153 String footerverbose =
"\nthe available functions are:\n" +
getAllExistingCFsAsString() +
"\n\nDescriptions of the available functions follow:\n\n";
1158 desc = cf.getFunction() +
": " + cf.getDescription();
1159 usable =
"\nThe function can be used on:\n";
1160 for (Class cl : cf.getIsUsableFor()) {
1161 usable += cl.getSimpleName() +
" ";
1163 hasfix =
"\nThe function has a fixing option: " + cf.getCanFix().toString();
1164 footerverbose += desc + hasfix + usable +
"\n\n";
1167 footerverbose +=
"\n\nPlease report issues at https://lab.multilingua.uni-hamburg.de/redmine/projects/corpus-services/issues";
1169 cmd = parser.parse(options, args);
1170 }
catch (ParseException e) {
1171 System.out.println(e.getMessage());
1172 formatter.printHelp(
"hzsk-corpus-services", header, options, footerverbose,
true);
1178 if (cmd.hasOption(
"h")) {
1180 formatter.printHelp(
"hzsk-corpus-services", header, options, footerverbose,
true);
1184 if (cmd.hasOption(
"p")) {
1185 if (cmd.hasOption(
"s")) {
1186 System.out.println(
"Options s and p for parameters are not allowed at the same time!!");
1187 formatter.printHelp(
"hzsk-corpus-services", header, options, footerverbose,
true);
1190 cfProperties = cmd.getOptionProperties(
"p");
1193 if (cmd.hasOption(
"s")) {
1195 settingsfilepath = cmd.getOptionValue(
"s");
1198 settingsfilepath =
"settings.param";
1201 if (
new File(settingsfilepath).exists()) {
1202 FileInputStream test =
new FileInputStream(settingsfilepath);
1203 cfProperties.loadFromXML(test);
1204 System.out.println(
"Properties are: " + cfProperties);
1206 System.out.println(
"No parameters loaded.");
void setUser(String user)
void setLanguage(String lang)
void setfixWhiteSpaces(String s)
void setHandlePrefix(String prefix)
Report execute(CorpusData cd)
Report runCorpusFunction(CorpusData cd, CorpusFunction cf)
void setExternalFSM(String s)
Collection< CorpusFunction > getDefaultUsableFunctions()
void addFileEndingWhiteListString(String s)
void setInterpolateTimeline(String s)
void setPass(String pass)
void setExternalFSM(String s)
Report runCorpusFunction(CorpusFunction cf)
static Collection< String > getAllExistingCFs()
void setWithAudio(String s)
Report runCorpusFunction(Corpus c, CorpusFunction cf)
void setSegmentation(String s)
void setCorpusName(String s)
Collection< CorpusData > getCorpusData()
static String generateDataTableHTML(Collection< ReportItem > errors, String summarylines)
static Collection< CorpusFunction > getAllExistingCFsAsCFs()
void setReplacement(String s)
String indent(String xml, String suppressedElements)
void setOutputFile(String s)
static org.jdom.Document W3cDocument2JdomDocument(org.w3c.dom.Document input)
static Report runCorpusFunctions(CorpusData cd, Collection< CorpusFunction > cfc)
static ExmaErrorList exmaError
Report runCorpusFunction(Corpus c, CorpusFunction cf, boolean fix)
void setFSMpath(String s)
void setTierNames(String sTiers)
void setFSM(String newfsm)
CorpusData getCorpusData()
static void readCommandLineOptions()
static void main(String[] args)
Collection< Class<?extends CorpusData > > getIsUsableFor()
static void createReports()
void setSourceFolder(String s)
void setCorpusName(String s)
void registerCorpusFunction(CorpusFunction cf)
void addWhiteListString(String s)
static Report runCorpusFunctions(CorpusData cd, Collection< CorpusFunction > cfc, boolean fix)
Report runCorpusFunction(CorpusData cd, CorpusFunction cf, boolean fix)
void setXSLresource(String s)
void setCorpusData(CorpusData corpusData)
static Collection< CorpusFunction > corpusFunctionStrings2Classes(Collection< String > corpusfunctionstrings)
void setSegmentation(String s)
void setChosencorpusfunctions(Collection< String > chosencorpusfunctions)
void setReplace(String s)
Collection< String > getChosencorpusfunctions()
Report runCorpusFunction(Collection< CorpusData > cdc, CorpusFunction cf)
static String JdomDocument2String(org.jdom.Document jdomDocument)
Collection< CorpusFunction > getUsableFunctions(CorpusData cd)
Report runCorpusFunctions(Collection< CorpusFunction > cfc)
void initDataWithURL(URL url, Collection< Class<?extends CorpusData >> clcds)
static Document createFullErrorList()
Collection< URL > createListofData(URL url)
static String getAllExistingCFsAsString()
void writeConfig(URL url)
void setKMLFilePath(String path)
Report runCorpusFunctions(Corpus c, Collection< CorpusFunction > cfc)
Report runCorpusFunction(Collection< CorpusData > cdc, CorpusFunction cf, boolean fix)
void setXpathContext(String s)
Collection< String > chooseFunctionDialog()