corpus-services  1.0
ComaAddTiersFromExbsCorrector.java
Go to the documentation of this file.
1 
9 package de.uni_hamburg.corpora.validation;
10 
16 import java.io.File;
17 import java.io.IOException;
18 import java.io.UnsupportedEncodingException;
19 import java.util.Collection;
20 import java.util.HashMap;
21 import java.util.HashSet;
22 import java.util.Set;
23 import java.util.List;
24 import java.util.Map;
25 import javax.xml.parsers.ParserConfigurationException;
26 import javax.xml.transform.TransformerException;
27 import javax.xml.xpath.XPathExpressionException;
28 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
29 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
30 import org.exmaralda.partitureditor.jexmaralda.BasicBody;
31 import org.exmaralda.partitureditor.jexmaralda.Tier;
32 import org.jdom.JDOMException;
33 import org.jdom.xpath.XPath;
34 import org.xml.sax.SAXException;
35 
40 public class ComaAddTiersFromExbsCorrector extends Checker implements CorpusFunction {
41 
// Coma file being processed; assigned from the CorpusData URL at the start of function(CorpusData, Boolean).
42  File comafile;
// Provides the output file when fixes are written. NOTE(review): no assignment is visible in this class - confirm where it is set.
43  ValidatorSettings settings;
// Format of the Name attribute of a generated Key: %2$s receives the tier category, %1$s the tier type description.
44  String tierNameFormat = "Tier %2$s (%1$s):";
// Format of the text content of a generated Key; receives the default description of the tier category.
45  String tierTextFormat = "%s";
// Not referenced anywhere in the visible code.
46  String comaLoc = "";
47 
49  //can fix
50  super(true);
51  }
52 
57  @Override
58  public Report function(CorpusData cd, Boolean fix)
59  throws SAXException, JDOMException, IOException, JexmaraldaException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, XPathExpressionException {
60  CorpusIO cio = new CorpusIO();
61  Map<String, String> tiers = new HashMap<String, String>();
62  tiers.put("akz", "Accentuation/stress");
63  tiers.put("c", "Indicates that the automatic pos-annotation is "
64  + "incorrect");
65  tiers.put("cs", "Codeswitching");
66  tiers.put("CW", "Annotation of code switching");
67  tiers.put("de", "German translation");
68  tiers.put("disfluency", "Disfluency");
69  tiers.put("en", "English translation");
70  tiers.put("fe", "English free translation");
71  tiers.put("fg", "German free translation");
72  tiers.put("fr", "Russian free translation");
73  tiers.put("fh", "Hungarian free translation");
74  tiers.put("so", "Source origin");
75  tiers.put("ge", "Morphological annotation: "
76  + "English gloss of each morpheme");
77  tiers.put("gr", "Morphological annotation: "
78  + "Russian gloss of each morpheme");
79  tiers.put("hd", "Standard German translation");
80  tiers.put("IST", "Annotation of information status");
81  tiers.put("k", "Free Comment");
82  tiers.put("lang", "Language of utterance");
83  tiers.put("lemma", "Lemma");
84  tiers.put("mb", "Morpheme break");
85  tiers.put("mc", "Part of speech of each morpheme");
86  tiers.put("mp", "Morphophonemes, underlying forms");
87  tiers.put("mT", "Morphological transliteration");
88  tiers.put("nt", "Notes on the text unit");
89  tiers.put("nv", "Non-verbal");
90  tiers.put("pho-adult", "Phonetic target structure");
91  tiers.put("pho", "Phonetic annnotation");
92  tiers.put("pos", "Part of Speech");
93  tiers.put("pos-sup", "Superordinate part of Speech");
94  tiers.put("ps", "Part of speech of each word");
95  tiers.put("ref", "Name of the communication");
96  tiers.put("SeR", "Annotation of semantic roles");
97  tiers.put("st", "Source texts: normally in Cyrillic transliteration");
98  tiers.put("sup", "suprasegmental information");
99  tiers.put("SyF", "Annotation of syntactic function");
100  tiers.put("syll", "Syllable structure");
101  tiers.put("ts", "Transcription (what is heard)");
102  tiers.put("tx", "Tier for interlinearization");
103  tiers.put("type", "Type (spontaneous vs. imitated) of utterance");
104  tiers.put("word", "Orthographic form of tokens");
105  // These are NOT in the catalogue...?
106  tiers.put("v", "Verbal");
107  tiers.put("no", "Numbering");
108  tiers.put("anno", "Anonymisation");
109  tiers.put("nn", "Action by unspecified source");
110 
111  Set<String> skipTiers = new HashSet<String>();
112  skipTiers.add("COLUMN-LABEL");
113  skipTiers.add("ROW-LABEL");
114  skipTiers.add("SUB-ROW-LABEL");
115  skipTiers.add("EMPTY");
116  skipTiers.add("EMPTY-EDITOR");
117  Report stats = new Report();
118 
119  comafile = new File(cd.getURL().toString());
120  String str = comafile.getPath().substring(6);
121  org.jdom.Document corpus
122  = org.exmaralda.common.jdomutilities.IOUtilities.readDocumentFromLocalFile(
123  str);
124  XPath xpCommunications = XPath.newInstance("//Communication");
125  List allCommunications = xpCommunications.selectNodes(corpus);
126  for (Object o : allCommunications) {
127  org.jdom.Element communication = (org.jdom.Element) o;
128  //retrieve the communication name
129  String communicationName = communication.getAttributeValue("Name");
130  //pick up basic transcriptions
131  XPath xpBasTrans = XPath.newInstance("Transcription[Description"
132  + "/Key[@Name='segmented']/text()='false']");
133  List allBasTrans = xpBasTrans.selectNodes(communication);
134  for (Object oB : allBasTrans) {
135  org.jdom.Element basTrans = (org.jdom.Element) oB;
136  String relPath = basTrans.getChildText("NSLink");
137  String filePath = comafile.getParent() + File.separator
138  + relPath;
139  filePath = filePath.substring(6);
140  File file = new File(filePath);
141  if (!file.isFile()) {
142  // we already checked validity of files in other checks
143  continue;
144  }
145  org.jdom.Element desc = basTrans.getChild("Description");
146  List keys = desc.getChildren("Key");
147  Set<String> addedTiers = new HashSet<String>();
148  // add tiers that are already in the coma file to the set so that they are not added to the coma file
149  // again from the exbs files
150  for (Object key : keys) {
151  org.jdom.Element keyElement = (org.jdom.Element) key;
152  if (keyElement.getAttributeValue("Name").startsWith("Tier")) {
153  int fIndex = keyElement.getAttributeValue("Name").indexOf(" ");
154  int lIndex = keyElement.getAttributeValue("Name").lastIndexOf(" ");
155  addedTiers.add(keyElement.getAttributeValue("Name").substring(fIndex + 1, lIndex));
156  }
157  }
158  BasicTranscription bt = new BasicTranscription(filePath);
159  BasicBody bb = bt.getBody();
160  String[] tierIDs = bb.getAllTierIDs();
161 
162  for (String tierID : tierIDs) {
163  if (skipTiers.contains(tierID)) {
164  stats.addNote(function,
165  "Skipped a tier: " + tierID,
166  "This tier does not need to be included in "
167  + "coma file");
168  continue;
169  }
170  Tier tier = null;
171  try {
172  tier = bb.getTierWithID(tierID);
173  } catch (JexmaraldaException je) {
174  System.out.println("ERRORR: tier with ID " + tierID
175  + " is lost...");
176  continue;
177  }
178  String displayName = tier.getDisplayName();
179  String category = tier.getCategory();
180  String tierType = tier.getType();
181 // System.out.println("DEBUG: id,disp,cat" +
182 // tierID + " , " + displayName + " , " + category);
183  org.jdom.Element keyElement = new org.jdom.Element("Key");
184  boolean alreadyAdded = false;
185  for (String added : addedTiers) {
186  if (added.equals(category)) {
187  // no need to add twice?
188  alreadyAdded = true;
189  }
190  }
191  if (alreadyAdded) {
192  continue;
193  } else if (tiers.containsKey(category)) {
194  String describeTierType = "Unknown";
195  if (tierType.equals("a")) {
196  describeTierType = "Annotation";
197  } else if (tierType.equals("d")) {
198  describeTierType = "Description";
199  } else if (tierType.equals("t")) {
200  describeTierType = "Transcription";
201  } else {
202  describeTierType = "Unknown";
203  }
204  keyElement.setAttribute("Name",
205  String.format(tierNameFormat,
206  describeTierType, category));
207  keyElement.setText(String.format(tierTextFormat,
208  tiers.get(category)));
209  desc.addContent(keyElement);
210  stats.addFix(function, cd,
211  "Tier was missing from COMA: "
212  + tierID
213  + ": The default description has been added.");
214  addedTiers.add(category);
215  } else {
216  stats.addWarning(function,
217  "Unrecognised tier category: "
218  + category,
219  "Tier must be added manually to coma");
220  }
221  }
222  }
223  }
224  if (fix) {
225  cio.write(corpus, settings.getOutputFile().toURI().toURL());
226  }
227  return stats;
228  }
229 
235  @Override
236  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
237  try {
238  Class cl = Class.forName("de.uni_hamburg.corpora.ComaData");
239  IsUsableFor.add(cl);
240  } catch (ClassNotFoundException ex) {
241  report.addException(ex, " usable class not found");
242  }
243  return IsUsableFor;
244  }
245 
250  @Override
251  public String getDescription() {
252  String description = "This class loads coma data and for all communications adds "
253  + "all tiers found in the linked exb as a key value pairs to the description. ";
254  return description;
255  }
256 
257  @Override
258  public Report function(Corpus c, Boolean fix) throws SAXException, JDOMException, IOException, JexmaraldaException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, XPathExpressionException {
259  Report stats = new Report();
260  CorpusData cdata = c.getComaData();
261  stats = function(cdata, fix);
262  return stats;
263  }
264 
265 }
void addNote(String statId, String description)
Definition: Report.java:245
void addWarning(String statId, String description)
Definition: Report.java:164
void addException(Throwable e, String description)
Definition: Report.java:287
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:66
void addFix(String statId, CorpusData cd, String description)
Definition: Report.java:155