corpus-services  1.0
ComaSegmentCountChecker.java
Go to the documentation of this file.
1 package de.uni_hamburg.corpora.validation;
2 
12 import java.util.ArrayList;
13 import java.io.IOException;
14 import java.net.URL;
15 import java.util.Collection;
16 import java.util.List;
17 import java.util.regex.Pattern;
18 import javax.xml.parsers.ParserConfigurationException;
19 import javax.xml.transform.TransformerException;
20 import javax.xml.xpath.XPathExpressionException;
21 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
22 import org.jdom.JDOMException;
23 import org.jdom.xpath.XPath;
24 import org.w3c.dom.Document;
25 import org.w3c.dom.Element;
26 import org.w3c.dom.NodeList;
27 import org.xml.sax.SAXException;
28 
33 public class ComaSegmentCountChecker extends Checker implements CorpusFunction {
34 
35  String comaLoc = "";
36 
38  //fixing is available
39  super(true);
40  }
41 
47  @Override
48  public Report function(CorpusData cd, Boolean fix) throws ClassNotFoundException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
49  Report stats = new Report(); //create a new report
50  ComaData comad = (ComaData) cd;
51  org.jdom.Document comaDoc = comad.getJdom();
52  Document doc = JdomDocument2W3cDocument(comaDoc);
53  NodeList communications = doc.getElementsByTagName("Communication"); // divide by Communication tags
54  ArrayList<String> algorithmNames = new ArrayList<>(); // array for holding algorithm names
55  CorpusIO cio = new CorpusIO();
57  if (fix) {
58  List<org.jdom.Element> toRemove = new ArrayList<org.jdom.Element>();
59  XPath context;
60  context = XPath.newInstance("//Transcription[Description/Key[@Name='segmented']/text()='true']");
61  URL url;
62  List allContextInstances = context.selectNodes(comaDoc);
63  if (!allContextInstances.isEmpty()) {
64  for (int i = 0; i < allContextInstances.size(); i++) {
65  Object o = allContextInstances.get(i);
66  if (o instanceof org.jdom.Element) {
67  org.jdom.Element e = (org.jdom.Element) o;
68  List<org.jdom.Element> descKeys;
69  //in the coma file remove old stats first
70  descKeys = e.getChild("Description")
71  .getChildren();
72  for (org.jdom.Element ke : (List<org.jdom.Element>) descKeys) {
73  if (Pattern.matches("#(..).*", ke.getAttributeValue("Name"))) {
74  toRemove.add(ke);
75  }
76  }
77  for (org.jdom.Element re : toRemove) {
78  descKeys.remove(re);
79  }
80  //now get the new segment counts and add them insted
81  String s = e.getChildText("NSLink");
82  //System.out.println("NSLink:" + s);
83  url = new URL(cd.getParentURL() + s);
84  exs = (SegmentedTranscriptionData) cio.readFileURL(url);
85  List segmentCounts = exs.getSegmentCounts();
86  for (Object segmentCount : segmentCounts) {
87  if (segmentCount instanceof org.jdom.Element) {
88  org.jdom.Element segmentCountEl = (org.jdom.Element) segmentCount;
89  //Object key = segmentCountEl.getAttributeValue("attribute-name").substring(2);
90  Object key = segmentCountEl.getAttributeValue("attribute-name");
91  Object value = segmentCountEl.getValue();
92  //System.out.println("Value:" + value);
93  org.jdom.Element newKey = new org.jdom.Element("Key");
94  newKey.setAttribute("Name", (String) key);
95  newKey.setText(value.toString());
96  e.getChild("Description").addContent(
97  newKey);
98  report.addFix(function, cd, "Updated segment count " + key.toString() + ":" + value.toString() + "for transcription " + e.getAttributeValue("Name"));
99  }
100  }
101 
102  }
103  }
104  }
105  if (comaDoc != null) {
106  cd.updateUnformattedString(TypeConverter.JdomDocument2String(comaDoc));
107  cio.write(cd, cd.getURL());
108  report.addCorrect(function, cd, "Updated the segment counts!");
109  } else {
110  report.addCritical(function, cd, "Updating the segment counts was not possible!");
111  }
112  } //still check it now after they were added
113  for (int i = 0; i < communications.getLength(); i++) { //iterate through communications
114  Element communication = (Element) communications.item(i);
115  NodeList transcriptions = communication.getElementsByTagName("Transcription"); // get transcriptions of current communication
116  for (int j = 0; j < transcriptions.getLength(); j++) { // iterate through transcriptions
117  Element transcription = (Element) transcriptions.item(j);
118  //System.out.println("Transcription: " + transcription.getAttribute("Id"));
119  NodeList descriptions = transcription.getElementsByTagName("Description");
120  for (int d = 0; d < descriptions.getLength(); d++) {
121  Element description = (Element) descriptions.item(d);
122  NodeList keys = description.getElementsByTagName("Key");
123  // get keys of current transcription description
124  //we need to look for the key "Description" containing the "Key" Element with the segmented attribute
125  for (int k = 0; k < keys.getLength(); k++) { // look for the key with "segmented" attribute
126  Element key = (Element) keys.item(k);
127  //System.out.println("Key: " + key.getAttribute("Name"));
128  //System.out.println(key.getAttribute("Name").contains("#"));
129  //System.out.println(key.getAttribute("Name").contains(":"));
130  if (key.getAttribute("Name").contains("#") && key.getAttribute("Name").contains(":")) {
131  String text = key.getAttribute("Name");
132  //System.out.println(text);
133  int colonIndex = key.getAttribute("Name").lastIndexOf(':');
134  int hashIndex = key.getAttribute("Name").indexOf('#');
135  algorithmNames.add(key.getAttribute("Name").substring(hashIndex + 2, colonIndex));
136  }
137  }
138  break;
139  }
140 
141  }
142  }
143  //System.out.println(algorithmNames);
144  String algorithmName = "";
145  if (!algorithmNames.isEmpty()) {
146  algorithmName = algorithmNames.get(0);
147  boolean error = false;
148  for (int i = 1; i < algorithmNames.size(); i++) { // check if coma file contains different seg algorithms
149  if (!algorithmName.equals(algorithmNames.get(i))) {
150  error = true;
151  System.out.println("Coma file contains different segmentation algorithms: " + algorithmNames.get(i));
152  stats.addCritical(function, cd, "More than one segmentation algorithm: " + algorithmNames.get(i) + " and " + algorithmName);
153  break;
154  }
155  }
156  if (!error) {
157  stats.addCorrect(function, cd, "Only segmentation " + algorithmNames.get(1));
158  }
159  } else {
160  stats.addWarning(function, cd, "No segment counts added yet. Use Coma > Maintenance > Update segment counts to add them. ");
161  }
162  return stats;
163  }
164 
170  @Override
171  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
172  try {
173  Class cl = Class.forName("de.uni_hamburg.corpora.ComaData");
174  IsUsableFor.add(cl);
175  } catch (ClassNotFoundException ex) {
176  report.addException(ex, "Usable class not found.");
177  }
178  return IsUsableFor;
179  }
180 
185  @Override
186  public String getDescription() {
187  String description = "This class checks whether there are more than one "
188  + "segmentation algorithms used in the coma file. If that is the case"
189  + ", it issues warnings. If it ihas the fix option, it updates the segment counts from the exbs. ";
190  return description;
191  }
192 
193  @Override
194  public Report function(Corpus c, Boolean fix) throws ClassNotFoundException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
195  Report stats;
196  cd = c.getComaData();
197  stats = function(cd, fix);
198  return stats;
199  }
200 }
void addCritical(String description)
Definition: Report.java:104
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
void addWarning(String statId, String description)
Definition: Report.java:164
Collection< Class<?extends CorpusData > > getIsUsableFor()
void addCorrect(String statId, String description)
Definition: Report.java:217
static String JdomDocument2String(org.jdom.Document jdomDocument)
void addException(Throwable e, String description)
Definition: Report.java:287
void addFix(String statId, CorpusData cd, String description)
Definition: Report.java:155