corpus-services  1.0
DuplicateTierContentChecker.java
Go to the documentation of this file.
1 
9 package de.uni_hamburg.corpora.validation;
10 
18 import java.io.IOException;
19 import java.io.UnsupportedEncodingException;
20 import java.net.URL;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collection;
24 import java.util.HashMap;
25 import java.util.List;
26 import javax.xml.parsers.ParserConfigurationException;
27 import javax.xml.transform.TransformerException;
28 import javax.xml.xpath.XPathExpressionException;
29 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
30 import org.xml.sax.SAXException;
31 import org.w3c.dom.Document;
32 import org.w3c.dom.Element;
33 import org.w3c.dom.NodeList;
34 import org.jdom.JDOMException;
35 import org.jdom.xpath.XPath;
36 import java.util.Map;
37 import java.util.regex.Pattern;
38 import java.security.MessageDigest;
39 import java.security.NoSuchAlgorithmException;
40 import java.math.BigInteger;
41 
46 public class DuplicateTierContentChecker extends Checker implements CorpusFunction {
47 
// Length threshold used by normalize_tier: a tier whose normalized text is
// this long or shorter (the check is "<=") yields an empty fingerprint.
48  static final int MIN_TIER_LENGTH = 10; // tiers shorter than this will not be compared
// Categories of the tiers whose content is fingerprinted and compared.
49  ArrayList<String> lsTiersToCheck = new ArrayList<>(
50  Arrays.asList("ts", "tx", "fe", "fg", "fr"));
51  // This is the default list that can be overridden by calling setTierNames
// Characters stripped from the tier text before comparison: whitespace plus
// the punctuation and bracket characters listed in the character class.
52  Pattern rxClean = Pattern.compile("[ \r\n\t.,:;?!()\\[\\]/\\-{}<>*%=\"]",
53  Pattern.UNICODE_CHARACTER_CLASS);
// Apostrophe-like characters; normalize_tier replaces each match with a
// plain ASCII apostrophe.
54  Pattern rxApostrophe = Pattern.compile("[`‘’′́̀ʼ]", Pattern.UNICODE_CHARACTER_CLASS);
// Digest used by normalize_tier; stays null until function(CorpusData, Boolean)
// assigns an MD5 instance.
55  MessageDigest md = null;
56 
58  //no fixing option available
59  super(false);
60  }
61 
62  public void setTierNames(String sTiers) {
63  lsTiersToCheck = new ArrayList<>(Arrays.asList(sTiers.split(",")));
64  }
65 
69  public String normalize_tier(Element tier) {
70  String tierText = "";
71  NodeList events = tier.getElementsByTagName("event");
72  for (int j = 0; j < events.getLength(); j++) {
73  Element event = (Element) events.item(j);
74  String eventText = event.getTextContent();
75  tierText += eventText.toLowerCase();
76  }
77  tierText = rxClean.matcher(tierText).replaceAll("");
78  tierText = rxApostrophe.matcher(tierText).replaceAll("'");
79  if (tierText.length() <= MIN_TIER_LENGTH) {
80  return ""; // we don't want to compare empty or too short tiers
81  }
82  String hashText;
83  // Use short MD5 hashes instead of long strings
84  try {
85  byte[] tierBytes = tierText.getBytes("UTF-8");
86  byte[] md5Bytes = md.digest(tierBytes);
87  BigInteger bigInt = new BigInteger(1, md5Bytes);
88  hashText = bigInt.toString(16);
89  } catch (UnsupportedEncodingException ex) {
90  return tierText;
91  }
92  //System.out.println("hash: " + hashText);
93  return hashText;
94  }
95 
100  public Map<String, String> process_exb(CorpusData cd) {
101  Map<String, String> tierValues = new HashMap<>();
102  XMLData xml = (XMLData) cd;
103  Document doc = TypeConverter.JdomDocument2W3cDocument(xml.getJdom());
104 
105  NodeList tiers = doc.getElementsByTagName("tier"); // get all tiers of the transcript
106  ArrayList<Element> relevantTiers = new ArrayList();
107  for (int i = 0; i < tiers.getLength(); i++) {
108  Element tier = (Element) tiers.item(i);
109  String category = tier.getAttribute("category"); // get category so that we know is this is a relevant tier
110  if (lsTiersToCheck.contains(category)) {
111  if (tierValues.containsKey(category)) {
112  tierValues.put(category, tierValues.get(category) + normalize_tier(tier));
113  } else {
114  tierValues.put(category, normalize_tier(tier));
115  }
116  }
117  }
118  return tierValues;
119  }
120 
126  @Override
127  public Report function(CorpusData cd, Boolean fix) throws NoSuchAlgorithmException, TransformerException, ParserConfigurationException, SAXException, IOException, JDOMException, XPathExpressionException, JexmaraldaException, ClassNotFoundException {
128  System.out.println("Duplicate check started.");
129 
130  md = MessageDigest.getInstance("MD5");
131 
132  Report stats = new Report();
133  CorpusIO cio = new CorpusIO();
134  Map<String, HashMap<String, String>> tierValues = new HashMap<>();
135  for (String tierName : lsTiersToCheck) {
136  tierValues.put(tierName, new HashMap<String, String>());
137  }
138 
139  org.jdom.Document comaDoc = TypeConverter.String2JdomDocument(cd.toSaveableString());
140  XPath context;
141  context = XPath.newInstance("//Transcription[Description/Key[@Name='segmented']/text()='false']");
142  URL url;
143  List allContextInstances = context.selectNodes(comaDoc);
144  for (int i = 0; i < allContextInstances.size(); i++) {
145  Object o = allContextInstances.get(i);
146  if (o instanceof org.jdom.Element) {
147  org.jdom.Element e = (org.jdom.Element) o;
148  String sFilename = e.getChildText("NSLink");
149  System.out.println("NSLink: " + sFilename);
150  url = new URL(cd.getParentURL() + sFilename);
151  CorpusData exb = cio.readFileURL(url);
152  Map<String, String> curTierValues = process_exb(exb);
153  for (Map.Entry<String, String> entry : curTierValues.entrySet()) {
154  if (!tierValues.containsKey(entry.getKey())
155  || entry.getValue().length() <= 0) {
156  continue;
157  }
158  if (tierValues.get(entry.getKey()).containsKey(entry.getValue())) {
159  stats.addCritical(function, exb, "The file is a duplicate of "
160  + tierValues.get(entry.getKey()).get(entry.getValue())
161  + " (tier " + entry.getKey() + ").");
162  } else {
163  tierValues.get(entry.getKey()).put(entry.getValue(), exb.getFilename());
164  // Remember that this text for this tier was seen in this file
165  }
166  }
167  }
168  }
169 
170  return stats;
171  }
172 
178  @Override
179  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
180  try {
181  Class cl = Class.forName("de.uni_hamburg.corpora.ComaData");
182  IsUsableFor.add(cl);
183  } catch (ClassNotFoundException ex) {
184  report.addException(ex, " usable class not found");
185  }
186  return IsUsableFor;
187  }
188 
193  @Override
194  public String getDescription() {
195  String description = "This class takes a coma file, reads all exbs"
196  + " linked there, reads them and checks if there are duplicate"
197  + " or near-duplicate exbs in the corpus.";
198  return description;
199  }
200 
201  @Override
202  public Report function(Corpus c, Boolean fix) throws NoSuchAlgorithmException, TransformerException, ParserConfigurationException, SAXException, IOException, JDOMException, XPathExpressionException, JexmaraldaException, ClassNotFoundException {
203  Report stats;
204  cd = c.getComaData();
205  stats = function(cd, fix);
206  return stats;
207  }
208 
209 }
CorpusData readFileURL(URL url, Collection< Class<?extends CorpusData >> clcds)
Definition: CorpusIO.java:125
void addCritical(String description)
Definition: Report.java:104
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
static org.jdom.Document String2JdomDocument(String stringRespresentingDocument)
void addException(Throwable e, String description)
Definition: Report.java:287