corpus-services  1.0
CorpusDataRegexReplacer.java
Go to the documentation of this file.
1 /*
2  * To change this license header, choose License Headers in Project Properties.
3  * To change this template file, choose Tools | Templates
4  * and open the template in the editor.
5  */
6 package de.uni_hamburg.corpora.validation;
7 
15 import java.io.IOException;
16 import java.net.URISyntaxException;
17 import java.util.Collection;
18 import java.util.List;
19 import java.util.regex.Pattern;
20 import javax.xml.parsers.ParserConfigurationException;
21 import javax.xml.transform.TransformerException;
22 import javax.xml.xpath.XPathExpressionException;
23 import org.jdom.Attribute;
24 import org.jdom.Document;
25 import org.jdom.Element;
26 import org.jdom.JDOMException;
27 import org.jdom.xpath.XPath;
28 import org.xml.sax.SAXException;
29 import static org.apache.commons.lang3.StringEscapeUtils.escapeHtml4;
30 
35 public class CorpusDataRegexReplacer extends Checker implements CorpusFunction {
36 
37  //ToDo
38  boolean containsRegEx = false;
39  String replace = "'";
40  String replacement = "ยด";
41  boolean coma = false;
42  String xpathContext = "//*";
43  Document doc = null;
44  XPath context;
45 
47  super(true);
48  }
49 
55  @Override
56  public Report function(CorpusData cd, Boolean fix) // check whether there's any regEx instances on specified XPath
57  throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException {
58 
59  Report stats = new Report(); // create a new report
60  doc = TypeConverter.String2JdomDocument(cd.toSaveableString()); // read the file as a doc
61  Pattern replacePattern = Pattern.compile(replace);
62  context = XPath.newInstance(xpathContext);
63  List allContextInstances = context.selectNodes(doc);
64  String s;
65 
66  if (!allContextInstances.isEmpty()) {
67  for (int i = 0; i < allContextInstances.size(); i++) {
68  Object o = allContextInstances.get(i);
69  if (o instanceof Element) {
70  Element e = (Element) o;
71  s = e.getText();
72  if (replacePattern.matcher(s).find()) { // if file contains the RegEx then issue warning
73  containsRegEx = true;
74  if (fix) {
75  String snew = s.replaceAll(replace, replacement); //replace all replace with replacement
76  //TODO Attributes?
77  e.setText(snew);
78  stats.addFix(function, cd, "Replaced " + escapeHtml4(replace) + " with " + escapeHtml4(replacement) + " at " + escapeHtml4(xpathContext) + " here: " + escapeHtml4(s) + " with " + escapeHtml4(snew));
79  } else {
80  System.out.println("CorpusData file is containing " + escapeHtml4(replace) + " at " + escapeHtml4(xpathContext) + ": " + escapeHtml4(s));
81  stats.addCritical(function, cd, "CorpusData file is containing " + escapeHtml4(replace) + " at " + escapeHtml4(xpathContext) + ": " + escapeHtml4(s));
82  }
83  }
84  } else if (o instanceof Attribute) {
85  Attribute a = (Attribute) o;
86  s = a.getValue();
87  if (fix) {
88  System.out.println("Attributes cannot be replaced yet at " + escapeHtml4(xpathContext));
89  stats.addCritical(function, cd, "Attributes cannot be replaced yet at " + escapeHtml4(xpathContext));
90  }
91  } else {
92  stats.addWarning(function, cd, "Xpath " + escapeHtml4(xpathContext) + " does not lead to Element or Attribute");
93  s = "";
94  }
95 
96  }
97  if (!containsRegEx) {
98  stats.addCorrect(function, cd, "CorpusData file does not contain " + escapeHtml4(replace) + " at " + escapeHtml4(xpathContext));
99  } else if (fix) {
100  CorpusIO cio = new CorpusIO();
101  cd.updateUnformattedString(TypeConverter.JdomDocument2String(doc));
102  cio.write(cd, cd.getURL());
103  } else {
104  stats.addCorrect(function, cd, "CorpusData file does not contain anything at " + escapeHtml4(xpathContext));
105  }
106  }
107  return stats; // return the report with warnings
108  }
109 
110  @Override
111  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
112  try {
113  if (coma) {
114  Class cl3 = Class.forName("de.uni_hamburg.corpora.ComaData");
115  IsUsableFor.add(cl3);
116  } else {
117  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
118  IsUsableFor.add(cl);
119  }
120  } catch (ClassNotFoundException ex) {
121  report.addException(ex, "Usable class not found.");
122  }
123  return IsUsableFor;
124  }
125 
126  public void setReplace(String s) {
127  replace = s;
128  }
129 
130  public void setReplacement(String s) {
131  replacement = s;
132  }
133 
134  public void setXpathContext(String s) {
135  xpathContext = s;
136  }
137 
138  public void setComa(String s) {
139  if (s.equalsIgnoreCase("true") || s.equalsIgnoreCase("wahr") || s.equalsIgnoreCase("ja")) {
140  coma = true;
141  } else if (s.equalsIgnoreCase("false") || s.equalsIgnoreCase("falsch") || s.equalsIgnoreCase("nein")) {
142  coma = false;
143  } else {
144  report.addCritical(function, cd, "Parameter coma not recognized: " + escapeHtml4(s));
145  }
146  }
147 
152  @Override
153  public String getDescription() {
154  String description = "This class issues warnings if a file contains a certain RegEx and can also replace";
155  return description;
156  }
157 
158  @Override
159  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException {
160  Report stats = new Report();
161  if (coma) {
162  cd = c.getComaData();
163  stats = function(cd, fix);
164 
165  } else {
166  for (BasicTranscriptionData btd : c.getBasicTranscriptionData()) {
167  stats.merge(function(btd, fix));
168  }
169  }
170  return stats;
171  }
172 }
Collection< Class<?extends CorpusData > > getIsUsableFor()
void merge(Report sr)
Definition: Report.java:73
void addCritical(String description)
Definition: Report.java:104
void addWarning(String statId, String description)
Definition: Report.java:164
void addCorrect(String statId, String description)
Definition: Report.java:217
static org.jdom.Document String2JdomDocument(String stringRespresentingDocument)
static String JdomDocument2String(org.jdom.Document jdomDocument)
void addException(Throwable e, String description)
Definition: Report.java:287
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:66
void addFix(String statId, CorpusData cd, String description)
Definition: Report.java:155