9 package de.uni_hamburg.corpora.validation;
18 import java.io.IOException;
19 import java.io.UnsupportedEncodingException;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collection;
24 import java.util.HashMap;
25 import java.util.List;
26 import javax.xml.parsers.ParserConfigurationException;
27 import javax.xml.transform.TransformerException;
28 import javax.xml.xpath.XPathExpressionException;
29 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
30 import org.xml.sax.SAXException;
31 import org.w3c.dom.Document;
32 import org.w3c.dom.Element;
33 import org.w3c.dom.NodeList;
34 import org.jdom.JDOMException;
35 import org.jdom.xpath.XPath;
37 import java.util.regex.Pattern;
38 import java.security.MessageDigest;
39 import java.security.NoSuchAlgorithmException;
40 import java.math.BigInteger;
// Length threshold for a tier's cleaned text; the tier normalisation code tests
// tierText.length() <= MIN_TIER_LENGTH.
48 static final int MIN_TIER_LENGTH = 10;
// Tier categories whose content is compared. These five names are the defaults;
// the whole list is replaced when tier names are set from a comma-separated string.
49 ArrayList<String> lsTiersToCheck =
new ArrayList<>(
50 Arrays.asList(
"ts",
"tx",
"fe",
"fg",
"fr"));
// Characters deleted from tier text before comparison: space, CR, LF, tab and
// the punctuation, bracket and operator characters listed in the class.
52 Pattern rxClean = Pattern.compile(
"[ \r\n\t.,:;?!()\\[\\]/\\-{}<>*%=\"]",
53 Pattern.UNICODE_CHARACTER_CLASS);
// Apostrophe-like characters (backtick, typographic quotes and similar) that the
// tier normalisation code replaces with a plain ASCII apostrophe.
54 Pattern rxApostrophe = Pattern.compile(
"[`‘’′́̀ʼ]", Pattern.UNICODE_CHARACTER_CLASS);
// Digest used to hash cleaned tier text. Initialised to null here;
// function(CorpusData, Boolean) assigns an MD5 instance before the check runs.
55 MessageDigest md = null;
// Replace the list of tier categories with the comma-separated names in sTiers.
// String.split does not trim, so whitespace next to a comma stays part of the name.
63 lsTiersToCheck =
new ArrayList<>(Arrays.asList(sTiers.split(
",")));
// Collect every event element below the tier and append its lower-cased text
// content to tierText.
71 NodeList events = tier.getElementsByTagName(
"event");
72 for (
int j = 0; j < events.getLength(); j++) {
73 Element
event = (Element) events.item(j);
74 String eventText =
event.getTextContent();
// NOTE(review): toLowerCase() without a Locale argument uses the JVM default locale.
75 tierText += eventText.toLowerCase();
// Delete whitespace and punctuation, then map apostrophe variants to a plain apostrophe.
77 tierText = rxClean.matcher(tierText).replaceAll(
"");
78 tierText = rxApostrophe.matcher(tierText).replaceAll(
"'");
// Branch taken when the cleaned text is at most MIN_TIER_LENGTH characters long.
79 if (tierText.length() <= MIN_TIER_LENGTH) {
// Hash the UTF-8 bytes of the cleaned text with md and render the digest as an
// unsigned hexadecimal number.
85 byte[] tierBytes = tierText.getBytes(
"UTF-8");
86 byte[] md5Bytes = md.digest(tierBytes);
// Signum 1 makes BigInteger treat the digest bytes as a non-negative magnitude.
87 BigInteger bigInt =
new BigInteger(1, md5Bytes);
// NOTE(review): toString(16) does not zero-pad, so leading zero digits of the digest are dropped.
88 hashText = bigInt.toString(16);
89 }
catch (UnsupportedEncodingException ex) {
// Normalised tier content of the document, keyed by tier category.
101 Map<String, String> tierValues =
new HashMap<>();
// Visit every tier element in the document.
105 NodeList tiers = doc.getElementsByTagName(
"tier");
// NOTE(review): raw ArrayList on the right-hand side; the diamond form would avoid the unchecked warning.
106 ArrayList<Element> relevantTiers =
new ArrayList();
107 for (
int i = 0; i < tiers.getLength(); i++) {
108 Element tier = (Element) tiers.item(i);
109 String category = tier.getAttribute(
"category");
// Only tiers whose category attribute is listed in lsTiersToCheck are considered.
110 if (lsTiersToCheck.contains(category)) {
// A category that already has an entry gets this tier's normalised value appended to it.
111 if (tierValues.containsKey(category)) {
112 tierValues.put(category, tierValues.get(category) +
normalize_tier(tier));
// Checks the transcriptions selected from the coma document for tier content that
// was already recorded for another file and reports such files as duplicates.
127 public Report function(
CorpusData cd, Boolean fix)
throws NoSuchAlgorithmException, TransformerException, ParserConfigurationException, SAXException, IOException, JDOMException, XPathExpressionException, JexmaraldaException, ClassNotFoundException {
128 System.out.println(
"Duplicate check started.");
// MD5 digest used when hashing tier text.
130 md = MessageDigest.getInstance(
"MD5");
// Outer key: tier category. Inner map: tier value -> file name recorded for that value.
134 Map<String, HashMap<String, String>> tierValues =
new HashMap<>();
// Start with an empty inner map for every category that is to be checked.
135 for (String tierName : lsTiersToCheck) {
136 tierValues.put(tierName,
new HashMap<String, String>());
// Select the Transcription elements whose Description holds a Key named
// segmented with the text false.
141 context = XPath.newInstance(
"//Transcription[Description/Key[@Name='segmented']/text()='false']");
143 List allContextInstances = context.selectNodes(comaDoc);
144 for (
int i = 0; i < allContextInstances.size(); i++) {
145 Object o = allContextInstances.get(i);
// selectNodes returns an untyped list, so only JDOM elements are processed.
146 if (o instanceof org.jdom.Element) {
147 org.jdom.Element e = (org.jdom.Element) o;
148 String sFilename = e.getChildText(
"NSLink");
149 System.out.println(
"NSLink: " + sFilename);
// The NSLink text is appended to the parent URL of cd to form the file URL.
150 url =
new URL(cd.getParentURL() + sFilename);
// Tier values of the current file, keyed by tier category.
152 Map<String, String> curTierValues =
process_exb(exb);
153 for (Map.Entry<String, String> entry : curTierValues.entrySet()) {
// True for categories that are not tracked and for empty values.
154 if (!tierValues.containsKey(entry.getKey())
155 || entry.getValue().length() <= 0) {
// The same value is already recorded for this category: report the current
// file as a duplicate of the recorded one.
158 if (tierValues.get(entry.getKey()).containsKey(entry.getValue())) {
159 stats.
addCritical(
function, exb,
"The file is a duplicate of " 160 + tierValues.get(entry.getKey()).
get(entry.getValue())
161 +
" (tier " + entry.getKey() +
").");
// Record the current file name under this value.
163 tierValues.get(entry.getKey()).put(entry.getValue(), exb.getFilename());
// Look up the ComaData class by its fully qualified name; Class.forName throws
// ClassNotFoundException when the class cannot be located.
181 Class cl = Class.forName(
"de.uni_hamburg.corpora.ComaData");
183 }
catch (ClassNotFoundException ex) {
// Human-readable summary of what this checker does, built from three concatenated literals.
195 String description =
"This class takes a coma file, reads all exbs" 196 +
" linked there, reads them and checks if there are duplicate" 197 +
" or near-duplicate exbs in the corpus.";
// Corpus-level entry point: takes the coma data of the corpus and delegates to
// function(CorpusData, Boolean), storing the resulting report in stats.
202 public Report function(
Corpus c, Boolean fix)
throws NoSuchAlgorithmException, TransformerException, ParserConfigurationException, SAXException, IOException, JDOMException, XPathExpressionException, JexmaraldaException, ClassNotFoundException {
204 cd = c.getComaData();
205 stats =
function(cd, fix);
Collection< Class<?extends CorpusData > > getIsUsableFor()
CorpusData readFileURL(URL url, Collection< Class<?extends CorpusData >> clcds)
String normalize_tier(Element tier)
void setTierNames(String sTiers)
void addCritical(String description)
Map< String, String > process_exb(CorpusData cd)
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
static org.jdom.Document String2JdomDocument(String stringRespresentingDocument)
DuplicateTierContentChecker()
void addException(Throwable e, String description)