corpus-services  1.0
ExbScriptMixChecker.java
Go to the documentation of this file.
1 
9 package de.uni_hamburg.corpora.validation;
10 
17 import java.io.IOException;
18 import java.net.URISyntaxException;
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collection;
22 import javax.xml.parsers.ParserConfigurationException;
23 import javax.xml.transform.TransformerException;
24 import javax.xml.xpath.XPathExpressionException;
25 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
26 import org.jdom.JDOMException;
27 import org.xml.sax.SAXException;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30 import org.w3c.dom.Document;
31 import org.w3c.dom.Element;
32 import org.w3c.dom.NodeList;
33 import java.util.HashMap;
34 import java.util.Map;
35 
39 public class ExbScriptMixChecker extends Checker implements CorpusFunction {
40  ArrayList<String> lsTiersToCheck = new ArrayList<>(
41  Arrays.asList("tx", "mb", "mp", "ge"));
42  // Hardcoded list of tier names is bad. We'll have to replace it
43  // with a settings file or something like that.
44  static String sCharClassLat = "[a-zÀ-žḀ-ỹ]";
45  static String sCharClassCyr = "[Ѐ-ԯ]";
46  static String sCharClassGreek = "[΄-ϡϰ-Ͽἀ-῾]";
47  static String sCharClassArmenian = "[Ա-֏]";
48  static String sCharClassGeorgian = "[\u10a0-\u10ff]";
49  Pattern rxLat = Pattern.compile(sCharClassLat, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
50  Pattern rxCyr = Pattern.compile(sCharClassCyr, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
51  Pattern rxGreek = Pattern.compile(sCharClassGreek, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
52  Pattern rxArmenian = Pattern.compile(sCharClassArmenian, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
53  Pattern rxGeorgian = Pattern.compile(sCharClassGeorgian, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
54  Map<String, Pattern> dictScripts = new HashMap<>();
55 
57  //no fixing option available
58  super(false);
59  dictScripts.put("Cyrillic", rxCyr);
60  dictScripts.put("Latin", rxLat);
61  dictScripts.put("Greek", rxGreek);
62  dictScripts.put("Armenian", rxArmenian);
63  dictScripts.put("Georgian", rxGeorgian);
64  }
65 
66 
72  @Override
73  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
74  try {
75  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
76  IsUsableFor.add(cl);
77  } catch (ClassNotFoundException ex) {
78  report.addException(ex, " usable class not found");
79  }
80  return IsUsableFor;
81  }
82 
83 
84  @Override
85  public Report function(CorpusData cd, Boolean fix) throws IOException, SAXException {
86  //so this is easier this way :)
87  Document doc = null;
88  XMLData xml = (XMLData)cd;
90  Report stats = new Report(); // create a new report for the transcript
91 
92  NodeList tiers = doc.getElementsByTagName("tier"); // get all tiers of the transcript
93  ArrayList<Element> relevantTiers = new ArrayList();
94  for (int i = 0; i < tiers.getLength(); i++) {
95  Element tier = (Element)tiers.item(i);
96  String category = tier.getAttribute("category"); // get category so that we know is this is a relevant tier
97  if (lsTiersToCheck.contains(category)) {
98  relevantTiers.add(tier);
99  }
100  }
101  for (int i = 0; i < relevantTiers.size(); i++) {
102  Element curTier = relevantTiers.get(i);
103  NodeList events = curTier.getElementsByTagName("event");
104  String tierId = curTier.getAttribute("id");
105  String tierSpeaker = curTier.getAttribute("speaker");
106  int order = 1;
107 
108  for (int j = 0; j < events.getLength(); j++) {
109  Element event = (Element)events.item(j);
110  String eventStart = event.getAttribute("start");
111  String eventEnd = event.getAttribute("end");
112  String eventText = event.getTextContent();
113  ArrayList<String> lsScriptsUsed = new ArrayList<>();
114  for (Map.Entry<String, Pattern> entry : dictScripts.entrySet()) {
115  Pattern p = entry.getValue();
116  Matcher m = p.matcher(eventText);
117  if (m.find()) {
118  lsScriptsUsed.add(entry.getKey());
119  }
120  }
121  if (lsScriptsUsed.size() > 1) {
122  String eventRef = "event " + eventStart + "/" + eventEnd
123  + ", tier '" + tierId + "'";
124  // Highlight different scripts in different colors
125  String eventTextColored = "";
126  for (int iChar = 0; iChar < eventText.length(); ++iChar) {
127  boolean bScriptFound = false;
128  String curChar = eventText.substring(iChar, iChar + 1);
129  for (Map.Entry<String, Pattern> entry : dictScripts.entrySet()) {
130  Pattern p = entry.getValue();
131  Matcher m = p.matcher(curChar);
132  if (m.find()) {
133  eventTextColored += "<span class=\"char_"
134  + entry.getKey() + "\">" + curChar + "</span>";
135  bScriptFound = true;
136  break;
137  }
138  }
139  if (!bScriptFound) {
140  eventTextColored += curChar;
141  }
142  }
143  //Filename is added automatically so message can be shorter
144  String message = "Mixed scripts in \"" + eventTextColored
145  + "\" (" + String.join(", ", lsScriptsUsed) + "), "
146  + eventRef;
147  stats.addWarning(function, cd, message);
148  }
149  /*
150  else {
151  stats.addCorrect(function, cd, "ok");
152  }
153  */
154  }
155  }
156  return stats;
157  }
158 
159  @Override
160  public String getDescription() {
161  return "A functions that checks for mixed scripts (e.g. Cyrillic/Latin) in the transcription tiers of EXMARaLDA basic transcriptions and issues warnings if they are found";
162  }
163 
164  @Override
165  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException {
166  Report stats = new Report();
167  for (CorpusData cdata : c.getBasicTranscriptionData()) {
168  stats.merge(function(cdata, fix));
169  }
170  return stats;
171  }
172 }
void merge(Report sr)
Definition: Report.java:73
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
void addWarning(String statId, String description)
Definition: Report.java:164
void addException(Throwable e, String description)
Definition: Report.java:287
Collection< Class<?extends CorpusData > > getIsUsableFor()