corpus-services  1.0
ExbEventLinebreaksChecker.java
Go to the documentation of this file.
1 /*
2  * To change this license header, choose License Headers in Project Properties.
3  * To change this template file, choose Tools | Templates
4  * and open the template in the editor.
5  */
6 package de.uni_hamburg.corpora.validation;
7 
13 import java.io.IOException;
14 import java.util.Collection;
15 import java.util.regex.Pattern;
16 import org.jdom.Document;
17 import org.jdom.JDOMException;
18 import org.jdom.xpath.XPath;
19 import org.xml.sax.SAXException;
21 import java.net.URISyntaxException;
22 import java.util.List;
23 import javax.xml.parsers.ParserConfigurationException;
24 import javax.xml.transform.TransformerException;
25 import javax.xml.xpath.XPathExpressionException;
26 import org.jdom.Element;
27 import static org.apache.commons.lang3.StringEscapeUtils.escapeHtml4;
28 
38 public class ExbEventLinebreaksChecker extends Checker implements CorpusFunction {
39 
40  boolean linebreak = false; // set to true by function(CorpusData, Boolean) when an event text matches [\r\n]
41  String xpathContext = "//event"; // XPath expression selecting the elements whose text is checked
42  XPath context; // compiled from xpathContext inside function(CorpusData, Boolean)
43  Document doc; // JDOM document parsed from the CorpusData that is currently being checked
44 
46  //fixing option available
47  super(true);
48  }
49 
55  @Override
56  public Report function(CorpusData cd, Boolean fix) // check whether there's any illegal apostrophes '
57  throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException {
58  Report stats = new Report(); // create a new report
59  doc = TypeConverter.String2JdomDocument(cd.toSaveableString()); // read the file as a doc
60  Pattern replacePattern = Pattern.compile("[\r\n]");
61  context = XPath.newInstance(xpathContext);
62  List allContextInstances = context.selectNodes(doc);
63  CorpusIO cio = new CorpusIO();
64  String s = "";
65  if (!allContextInstances.isEmpty()) {
66  for (int i = 0; i < allContextInstances.size(); i++) {
67  Object o = allContextInstances.get(i);
68  if (o instanceof Element) {
69  Element e = (Element) o;
70  s = e.getText();
71  if (replacePattern.matcher(s).find()) { // if file contains the RegEx then issue warning
72  linebreak = true;
73  if (fix) {
74  String snew = s.replaceAll("[\r\n]", ""); //replace all replace with replacement
75  //TODO Attributes?
76  e.setText(snew);
77  cd.updateUnformattedString(doc.toString());
78  cio.write(cd, cd.getURL());
79  stats.addFix(function, cd, "Removed line ending in an event: " + escapeHtml4(s) + " with " + escapeHtml4(snew));
80  } else {
81  System.out.println("Exb is containing line ending in an event: " + escapeHtml4(s));
82  stats.addCritical(function, cd, "Exb is containing line ending in an event: " + escapeHtml4(s));
83  }
84  }
85  }
86  }
87  if (!linebreak) {
88  stats.addCorrect(function, cd, "CorpusData file does not contain line ending in an event");
89  }
90  } else {
91  stats.addCorrect(function, cd, "CorpusData file does not contain any event");
92  }
93  return stats; // return the report with warnings
94  }
95 
101  @Override
102  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
103  try {
104  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
105  IsUsableFor.add(cl);
106  } catch (ClassNotFoundException ex) {
107  report.addException(ex, "Usable class not found.");
108  }
109  return IsUsableFor;
110  }
111 
116  @Override
117  public String getDescription() {
118  String description = "This class issues warnings if the exb file contains "
119  + "linebreaks or fixes linebreaks in the events and adds those "
120  + "warnings to the report which it returns.";
121  return description;
122  }
123 
124  @Override
125  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException {
126  Report stats = new Report();
127  for (CorpusData cdata : c.getBasicTranscriptionData()) {
128  stats.merge(function(cdata, fix));
129  }
130  return stats;
131  }
132 
133 }
void merge(Report sr)
Definition: Report.java:73
void addCritical(String description)
Definition: Report.java:104
void addCorrect(String statId, String description)
Definition: Report.java:217
static org.jdom.Document String2JdomDocument(String stringRespresentingDocument)
void addException(Throwable e, String description)
Definition: Report.java:287
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:66
void addFix(String statId, CorpusData cd, String description)
Definition: Report.java:155