9 package de.uni_hamburg.corpora.validation;
17 import java.io.IOException;
18 import java.io.UnsupportedEncodingException;
19 import java.util.Collection;
20 import java.util.HashMap;
21 import java.util.HashSet;
23 import java.util.List;
25 import javax.xml.parsers.ParserConfigurationException;
26 import javax.xml.transform.TransformerException;
27 import javax.xml.xpath.XPathExpressionException;
28 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
29 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
30 import org.exmaralda.partitureditor.jexmaralda.BasicBody;
31 import org.exmaralda.partitureditor.jexmaralda.Tier;
32 import org.jdom.JDOMException;
33 import org.jdom.xpath.XPath;
34 import org.xml.sax.SAXException;
// Format for the "Name" attribute of a newly created Key element. It is filled via
// String.format(tierNameFormat, describeTierType, category), so the explicit argument
// indices render it as "Tier <category> (<tier type description>):".
44 String tierNameFormat =
"Tier %2$s (%1$s):";
// Format for the text content of a newly created Key element; it receives the
// description looked up in the tiers map for the tier's category.
45 String tierTextFormat =
"%s";
59 throws SAXException, JDOMException, IOException, JexmaraldaException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, XPathExpressionException {
61 Map<String, String> tiers =
new HashMap<String, String>();
62 tiers.put(
"akz",
"Accentuation/stress");
63 tiers.put(
"c",
"Indicates that the automatic pos-annotation is " 65 tiers.put(
"cs",
"Codeswitching");
66 tiers.put(
"CW",
"Annotation of code switching");
67 tiers.put(
"de",
"German translation");
68 tiers.put(
"disfluency",
"Disfluency");
69 tiers.put(
"en",
"English translation");
70 tiers.put(
"fe",
"English free translation");
71 tiers.put(
"fg",
"German free translation");
72 tiers.put(
"fr",
"Russian free translation");
73 tiers.put(
"fh",
"Hungarian free translation");
74 tiers.put(
"so",
"Source origin");
75 tiers.put(
"ge",
"Morphological annotation: " 76 +
"English gloss of each morpheme");
77 tiers.put(
"gr",
"Morphological annotation: " 78 +
"Russian gloss of each morpheme");
79 tiers.put(
"hd",
"Standard German translation");
80 tiers.put(
"IST",
"Annotation of information status");
81 tiers.put(
"k",
"Free Comment");
82 tiers.put(
"lang",
"Language of utterance");
83 tiers.put(
"lemma",
"Lemma");
84 tiers.put(
"mb",
"Morpheme break");
85 tiers.put(
"mc",
"Part of speech of each morpheme");
86 tiers.put(
"mp",
"Morphophonemes, underlying forms");
87 tiers.put(
"mT",
"Morphological transliteration");
88 tiers.put(
"nt",
"Notes on the text unit");
89 tiers.put(
"nv",
"Non-verbal");
90 tiers.put(
"pho-adult",
"Phonetic target structure");
91 tiers.put(
"pho",
"Phonetic annnotation");
92 tiers.put(
"pos",
"Part of Speech");
93 tiers.put(
"pos-sup",
"Superordinate part of Speech");
94 tiers.put(
"ps",
"Part of speech of each word");
95 tiers.put(
"ref",
"Name of the communication");
96 tiers.put(
"SeR",
"Annotation of semantic roles");
97 tiers.put(
"st",
"Source texts: normally in Cyrillic transliteration");
98 tiers.put(
"sup",
"suprasegmental information");
99 tiers.put(
"SyF",
"Annotation of syntactic function");
100 tiers.put(
"syll",
"Syllable structure");
101 tiers.put(
"ts",
"Transcription (what is heard)");
102 tiers.put(
"tx",
"Tier for interlinearization");
103 tiers.put(
"type",
"Type (spontaneous vs. imitated) of utterance");
104 tiers.put(
"word",
"Orthographic form of tokens");
106 tiers.put(
"v",
"Verbal");
107 tiers.put(
"no",
"Numbering");
108 tiers.put(
"anno",
"Anonymisation");
109 tiers.put(
"nn",
"Action by unspecified source");
// Tier IDs that are tested with skipTiers.contains(tierID) in the tier loop further
// down; a matching tier produces a "Skipped a tier: <id>" message there.
// NOTE(review): these look like layout/helper tier IDs rather than content tiers —
// confirm against the EXB files this is run on.
111 Set<String> skipTiers =
new HashSet<String>();
112 skipTiers.add(
"COLUMN-LABEL");
113 skipTiers.add(
"ROW-LABEL");
114 skipTiers.add(
"SUB-ROW-LABEL");
115 skipTiers.add(
"EMPTY");
116 skipTiers.add(
"EMPTY-EDITOR");
119 comafile =
new File(cd.getURL().toString());
120 String str = comafile.getPath().substring(6);
121 org.jdom.Document corpus
122 = org.exmaralda.common.jdomutilities.IOUtilities.readDocumentFromLocalFile(
124 XPath xpCommunications = XPath.newInstance(
"//Communication");
125 List allCommunications = xpCommunications.selectNodes(corpus);
126 for (Object o : allCommunications) {
127 org.jdom.Element communication = (org.jdom.Element) o;
129 String communicationName = communication.getAttributeValue(
"Name");
131 XPath xpBasTrans = XPath.newInstance(
"Transcription[Description" 132 +
"/Key[@Name='segmented']/text()='false']");
133 List allBasTrans = xpBasTrans.selectNodes(communication);
134 for (Object oB : allBasTrans) {
135 org.jdom.Element basTrans = (org.jdom.Element) oB;
136 String relPath = basTrans.getChildText(
"NSLink");
137 String filePath = comafile.getParent() + File.separator
139 filePath = filePath.substring(6);
140 File file =
new File(filePath);
141 if (!file.isFile()) {
145 org.jdom.Element desc = basTrans.getChild(
"Description");
146 List keys = desc.getChildren(
"Key");
147 Set<String> addedTiers =
new HashSet<String>();
150 for (Object key : keys) {
151 org.jdom.Element keyElement = (org.jdom.Element) key;
152 if (keyElement.getAttributeValue(
"Name").startsWith(
"Tier")) {
153 int fIndex = keyElement.getAttributeValue(
"Name").indexOf(
" ");
154 int lIndex = keyElement.getAttributeValue(
"Name").lastIndexOf(
" ");
155 addedTiers.add(keyElement.getAttributeValue(
"Name").substring(fIndex + 1, lIndex));
158 BasicTranscription bt =
new BasicTranscription(filePath);
159 BasicBody bb = bt.getBody();
160 String[] tierIDs = bb.getAllTierIDs();
162 for (String tierID : tierIDs) {
163 if (skipTiers.contains(tierID)) {
165 "Skipped a tier: " + tierID,
166 "This tier does not need to be included in " 172 tier = bb.getTierWithID(tierID);
173 }
catch (JexmaraldaException je) {
174 System.out.println(
"ERRORR: tier with ID " + tierID
178 String displayName = tier.getDisplayName();
179 String category = tier.getCategory();
180 String tierType = tier.getType();
183 org.jdom.Element keyElement =
new org.jdom.Element(
"Key");
184 boolean alreadyAdded =
false;
185 for (String added : addedTiers) {
186 if (added.equals(category)) {
193 }
else if (tiers.containsKey(category)) {
194 String describeTierType =
"Unknown";
195 if (tierType.equals(
"a")) {
196 describeTierType =
"Annotation";
197 }
else if (tierType.equals(
"d")) {
198 describeTierType =
"Description";
199 }
else if (tierType.equals(
"t")) {
200 describeTierType =
"Transcription";
202 describeTierType =
"Unknown";
204 keyElement.setAttribute(
"Name",
205 String.format(tierNameFormat,
206 describeTierType, category));
207 keyElement.setText(String.format(tierTextFormat,
208 tiers.get(category)));
209 desc.addContent(keyElement);
210 stats.
addFix(
function, cd,
211 "Tier was missing from COMA: " 213 +
": The default description has been added.");
214 addedTiers.add(category);
217 "Unrecognised tier category: " 219 "Tier must be added manually to coma");
238 Class cl = Class.forName(
"de.uni_hamburg.corpora.ComaData");
240 }
catch (ClassNotFoundException ex) {
252 String description =
"This class loads coma data and for all communications adds " 253 +
"all tiers found in the linked exb as a key value pairs to the description. ";
258 public Report function(
Corpus c, Boolean fix)
throws SAXException, JDOMException, IOException, JexmaraldaException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, XPathExpressionException {
261 stats =
function(cdata, fix);
void addNote(String statId, String description)
Collection< Class<?extends CorpusData > > getIsUsableFor()
ComaAddTiersFromExbsCorrector()
void addWarning(String statId, String description)
void addException(Throwable e, String description)
void write(CorpusData cd, URL url)
void addFix(String statId, CorpusData cd, String description)