9 package de.uni_hamburg.corpora.validation;
17 import java.io.IOException;
18 import java.net.URISyntaxException;
19 import java.util.ArrayList;
20 import java.util.Arrays;
21 import java.util.Collection;
22 import javax.xml.parsers.ParserConfigurationException;
23 import javax.xml.transform.TransformerException;
24 import javax.xml.xpath.XPathExpressionException;
25 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
26 import org.jdom.JDOMException;
27 import org.xml.sax.SAXException;
28 import java.util.regex.Matcher;
29 import java.util.regex.Pattern;
30 import org.w3c.dom.Document;
31 import org.w3c.dom.Element;
32 import org.w3c.dom.NodeList;
33 import java.util.HashMap;
40 ArrayList<String> lsTiersToCheck =
new ArrayList<>(
41 Arrays.asList(
"tx",
"mb",
"mp",
"ge"));
44 static String sCharClassLat =
"[a-zÀ-žḀ-ỹ]";
45 static String sCharClassCyr =
"[Ѐ-ԯ]";
46 static String sCharClassGreek =
"[΄-ϡϰ-Ͽἀ-῾]";
47 static String sCharClassArmenian =
"[Ա-֏]";
48 static String sCharClassGeorgian =
"[\u10a0-\u10ff]";
49 Pattern rxLat = Pattern.compile(sCharClassLat, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
50 Pattern rxCyr = Pattern.compile(sCharClassCyr, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
51 Pattern rxGreek = Pattern.compile(sCharClassGreek, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
52 Pattern rxArmenian = Pattern.compile(sCharClassArmenian, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
53 Pattern rxGeorgian = Pattern.compile(sCharClassGeorgian, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CHARACTER_CLASS);
54 Map<String, Pattern> dictScripts =
new HashMap<>();
// Register every supported script under its name. The key doubles as the
// suffix of the CSS class ("char_<name>") used when colouring characters.
dictScripts.put("Cyrillic", rxCyr);
dictScripts.put("Latin", rxLat);
dictScripts.put("Greek", rxGreek);
dictScripts.put("Armenian", rxArmenian);
dictScripts.put("Georgian", rxGeorgian);
75 Class cl = Class.forName(
"de.uni_hamburg.corpora.BasicTranscriptionData");
77 }
catch (ClassNotFoundException ex) {
92 NodeList tiers = doc.getElementsByTagName(
"tier");
93 ArrayList<Element> relevantTiers =
new ArrayList();
94 for (
int i = 0; i < tiers.getLength(); i++) {
95 Element tier = (Element)tiers.item(i);
96 String category = tier.getAttribute(
"category");
97 if (lsTiersToCheck.contains(category)) {
98 relevantTiers.add(tier);
101 for (
int i = 0; i < relevantTiers.size(); i++) {
102 Element curTier = relevantTiers.get(i);
103 NodeList events = curTier.getElementsByTagName(
"event");
104 String tierId = curTier.getAttribute(
"id");
105 String tierSpeaker = curTier.getAttribute(
"speaker");
108 for (
int j = 0; j < events.getLength(); j++) {
109 Element
event = (Element)events.item(j);
110 String eventStart =
event.getAttribute(
"start");
111 String eventEnd =
event.getAttribute(
"end");
112 String eventText =
event.getTextContent();
113 ArrayList<String> lsScriptsUsed =
new ArrayList<>();
114 for (Map.Entry<String, Pattern> entry : dictScripts.entrySet()) {
115 Pattern p = entry.getValue();
116 Matcher m = p.matcher(eventText);
118 lsScriptsUsed.add(entry.getKey());
121 if (lsScriptsUsed.size() > 1) {
122 String eventRef =
"event " + eventStart +
"/" + eventEnd
123 +
", tier '" + tierId +
"'";
125 String eventTextColored =
"";
126 for (
int iChar = 0; iChar < eventText.length(); ++iChar) {
127 boolean bScriptFound =
false;
128 String curChar = eventText.substring(iChar, iChar + 1);
129 for (Map.Entry<String, Pattern> entry : dictScripts.entrySet()) {
130 Pattern p = entry.getValue();
131 Matcher m = p.matcher(curChar);
133 eventTextColored +=
"<span class=\"char_" 134 + entry.getKey() +
"\">" + curChar +
"</span>";
140 eventTextColored += curChar;
144 String message =
"Mixed scripts in \"" + eventTextColored
145 +
"\" (" + String.join(
", ", lsScriptsUsed) +
"), " 161 return "A functions that checks for mixed scripts (e.g. Cyrillic/Latin) in the transcription tiers of EXMARaLDA basic transcriptions and issues warnings if they are found";
165 public Report function(
Corpus c, Boolean fix)
throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException {
167 for (
CorpusData cdata : c.getBasicTranscriptionData()) {
168 stats.
merge(
function(cdata, fix));
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
void addWarning(String statId, String description)
void addException(Throwable e, String description)
Collection< Class<?extends CorpusData > > getIsUsableFor()