corpus-services  1.0
LanguageToolChecker.java
Go to the documentation of this file.
1 
10 package de.uni_hamburg.corpora.validation;
11 
19 import java.io.IOException;
20 import java.net.URISyntaxException;
21 import java.util.List;
22 import java.util.ArrayList;
23 import java.util.Collection;
24 import javax.xml.parsers.ParserConfigurationException;
25 import javax.xml.transform.TransformerException;
26 import javax.xml.xpath.XPathExpressionException;
27 import org.xml.sax.SAXException;
28 import org.w3c.dom.Document;
29 import org.w3c.dom.Element;
30 import org.w3c.dom.Node;
31 import org.w3c.dom.NodeList;
32 import org.w3c.dom.Text;
33 
34 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
35 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
36 import org.jdom.JDOMException;
37 
38 import org.languagetool.rules.RuleMatch;
39 import org.languagetool.JLanguageTool;
40 import org.languagetool.language.GermanyGerman;
41 //import org.languagetool.language.BritishEnglish;
42 //import org.languagetool.language.Russian;
43 
47 public class LanguageToolChecker extends Checker implements CorpusFunction {
48 
49  static String filename;
50  BasicTranscription bt;
51  static BasicTranscriptionData btd;
52  ValidatorSettings settings;
53  List<String> conventions = new ArrayList<String>();
54  List<String> problems = new ArrayList<String>();
55  String tierToCheck = "fg";
56  String language = "de";
57  JLanguageTool langTool;
58 
60 
64  super(false);
65  }
66 
71  @Override
72  public Report function(CorpusData cd, Boolean fix)
73  throws SAXException, IOException, ParserConfigurationException, JexmaraldaException {
74  Report stats = new Report();
75  btd = new BasicTranscriptionData(cd.getURL());
76  if (language.equals("de")) {
77  langTool = new JLanguageTool(new GermanyGerman());
78  System.out.println("Language set to German");
79  /*
80  } else if (language.equals("en")) {
81  //needs to be English!
82  //langTool = new JLanguageTool(new BritishEnglish());
83  //System.out.println("Language set to English");
84  } else if (language.equals("ru")) {
85  //needs to be Russian!
86  //langTool = new JLanguageTool(new Russian());
87  //System.out.println("Language set to Russian");
88  */
89  } else {
90  Report report = new Report();
91  report.addCritical(function, cd, "Missing languagetool resource for language "
92  + language);
93  return stats;
94  }
95  Document doc = TypeConverter.JdomDocument2W3cDocument(btd.getJdom());
96  NodeList tiers = doc.getElementsByTagName("tier");
97  List<RuleMatch> matches = new ArrayList<RuleMatch>();
98  int count = 0;
99  for (int k = 0; k < tiers.getLength(); k++) {
100  Element tier = (Element) tiers.item(k);
101  if (!tier.getAttribute("category").equals(tierToCheck)) {
102  continue;
103  }
104  NodeList events = tier.getElementsByTagName("event");
105  for (int i = 0; i < events.getLength(); i++) {
106  Element event = (Element) events.item(i);
107  NodeList eventTexts = event.getChildNodes();
108  for (int j = 0; j < eventTexts.getLength(); j++) {
109  Node maybeText = eventTexts.item(j);
110  if (maybeText.getNodeType() != Node.TEXT_NODE) {
111  if (maybeText.getNodeType() == Node.ELEMENT_NODE
112  && maybeText.getNodeName().equals("ud-information")) {
113  // XXX: ud-information is weird I'll just skip it...
114  continue;
115  }
116  System.out.println("This is not a text node: "
117  + maybeText);
118  continue;
119  }
120  Text eventText = (Text) maybeText;
121  String text = eventText.getWholeText();
122  matches = langTool.check(text);
123  for (RuleMatch match : matches) {
124  String message = "Potential error at characters "
125  + match.getFromPos() + "-" + match.getToPos() + ": "
126  + match.getMessage() + ": \""
127  + text.substring(match.getFromPos(),
128  match.getToPos()) + "\" "
129  + "Suggested correction(s): "
130  + match.getSuggestedReplacements();
131  stats.addWarning(function, cd, message
132  );
133 // System.out.println("Potential error at characters " +
134 // match.getFromPos() + "-" + match.getToPos() + ": " +
135 // match.getMessage() + ": \"" +
136 // text.substring(match.getFromPos(),
137 // match.getToPos()) + "\" " +
138 // "Suggested correction(s): " +
139 // match.getSuggestedReplacements());
140  //add ExmaError tierID eventID
141  exmaError.addError(function, cd.getURL().getFile(), tier.getAttribute("id"), event.getAttribute("start"), false, message);
142  }
143  if (!matches.isEmpty()) {
144  count++;
145  }
146  }
147 
148  }
149  }
150  if (count==0) {
151  stats.addCorrect(function, cd, "No spelling errors found.");
152  }
153  return stats;
154  }
155 
156 
162  @Override
163  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
164  try {
165  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
166  IsUsableFor.add(cl);
167  } catch (ClassNotFoundException ex) {
168  report.addException(ex, "unknown class not found error");
169  }
170  return IsUsableFor;
171  }
172 
173  @Override
174  public String getDescription() {
175  String description = "This class takes a CorpusDataObject that is an Exb, "
176  + "checks if there are spell or grammar errors in German, English or Russian using LanguageTool and"
177  + " returns the errors in the Report and in the ExmaErrors.";
178  return description;
179  }
180 
181  public void setLanguage(String lang) {
182  language = lang;
183  }
184 
185  public void setTierToCheck(String ttc) {
186  tierToCheck = ttc;
187  }
188 
189  @Override
190  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException, JexmaraldaException {
191  Report stats = new Report();
192  for (CorpusData cdata : c.getBasicTranscriptionData()) {
193  stats.merge(function(cdata, fix));
194  }
195  return stats;
196  }
197 }
void merge(Report sr)
Definition: Report.java:73
Collection< Class<?extends CorpusData > > getIsUsableFor()
void addCritical(String description)
Definition: Report.java:104
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
void addWarning(String statId, String description)
Definition: Report.java:164
void addCorrect(String statId, String description)
Definition: Report.java:217
void addException(Throwable e, String description)
Definition: Report.java:287