10 package de.uni_hamburg.corpora.validation;
20 import java.io.IOException;
22 import java.io.UnsupportedEncodingException;
23 import java.net.URISyntaxException;
25 import java.util.Collection;
26 import java.util.List;
27 import java.util.HashMap;
28 import java.util.HashSet;
29 import java.util.logging.Level;
30 import java.util.logging.Logger;
31 import javax.xml.parsers.ParserConfigurationException;
32 import javax.xml.transform.TransformerException;
33 import javax.xml.xpath.XPathExpressionException;
35 import org.xml.sax.SAXException;
37 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
38 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
39 import org.exmaralda.partitureditor.jexmaralda.segment.AbstractSegmentation;
40 import org.exmaralda.partitureditor.fsm.FSMException;
41 import org.exmaralda.partitureditor.jexmaralda.SegmentedTranscription;
42 import org.jdom.Document;
43 import org.jdom.Element;
44 import org.jdom.JDOMException;
46 import org.exmaralda.coma.helpers.*;
// Name of the file being checked; it is handed to exmaError.addError(...) when a
// segmentation error is recorded.
// NOTE(review): static, and no assignment is visible in this excerpt - confirm where it is set.
55 static String filename;
// NOTE(review): no read or write of this field is visible in this excerpt - confirm it is still needed.
56 static BasicTranscription bt;
// Segmentation implementation; assigned in function(CorpusData, Boolean) depending on segmentationName.
59 AbstractSegmentation segmentation;
// Key that selects the segmentation implementation. The visible code compares it with
// "HIAT", "GAT", "cGAT_MINIMAL", "CHAT", "CHAT_MINIMAL", "DIDA" and "IPA".
61 String segmentationName =
"GENERIC";
// Path to an external FSM; when it is not empty it is copied to segmentation.pathToExternalFSM.
62 String path2ExternalFSM =
"";
/**
 * Checks a single corpus file for segmentation problems.
 *
 * Steps visible in this excerpt: select the segmentation implementation that
 * matches segmentationName, pass on the external FSM path if one is set,
 * convert the basic transcription into a segmented transcription, and record
 * the outcome in stats. An FSMException is additionally recorded in exmaError
 * together with its tier ID and TLI.
 *
 * NOTE(review): several original lines are missing from this excerpt (among
 * them the declarations of stats, btd, o and function, and the surrounding
 * try/catch), so the exact control flow cannot be confirmed from here.
 *
 * @param cd  the corpus file to check
 * @param fix presumably enables writing the segmented "_s.exs" file - the code
 *            that reads this flag is not visible here; confirm
 * @return presumably the collected report (stats) - the return statement is
 *         not visible here; confirm
 */
70 public Report function(
CorpusData cd, Boolean fix)
throws SAXException, JDOMException, IOException, JexmaraldaException, FSMException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, XPathExpressionException, URISyntaxException {
// Map the configured segmentation name onto the matching EXMARaLDA segmentation class.
74 if (segmentationName.equals(
"HIAT")) {
75 segmentation =
new org.exmaralda.partitureditor.jexmaralda.segment.HIATSegmentation();
76 }
else if (segmentationName.equals(
"GAT")) {
77 segmentation =
new org.exmaralda.partitureditor.jexmaralda.segment.GATSegmentation();
78 }
else if (segmentationName.equals(
"cGAT_MINIMAL")) {
79 segmentation =
new org.exmaralda.partitureditor.jexmaralda.segment.cGATMinimalSegmentation();
80 }
else if (segmentationName.equals(
"CHAT")) {
81 segmentation =
new org.exmaralda.partitureditor.jexmaralda.segment.CHATSegmentation();
82 }
else if (segmentationName.equals(
"CHAT_MINIMAL")) {
83 segmentation =
new org.exmaralda.partitureditor.jexmaralda.segment.CHATMinimalSegmentation();
84 }
else if (segmentationName.equals(
"DIDA")) {
85 segmentation =
new org.exmaralda.partitureditor.jexmaralda.segment.DIDASegmentation();
86 }
else if (segmentationName.equals(
"IPA")) {
87 segmentation =
new org.exmaralda.partitureditor.jexmaralda.segment.IPASegmentation();
// Presumably the fallback branch for every other name (the preceding "else" line is
// missing from this excerpt).
89 segmentation =
new org.exmaralda.partitureditor.jexmaralda.segment.GenericSegmentation();
// Only override the FSM of the segmentation when an external path was configured.
91 if (!path2ExternalFSM.equals(
"")) {
92 segmentation.pathToExternalFSM = path2ExternalFSM;
// Run the segmentation on the EXMARaLDA basic transcription obtained from btd.
98 SegmentedTranscription st = segmentation.BasicToSegmented(btd.
getEXMARaLDAbt());
// Remember the source exb file name inside the segmented transcription.
99 st.setEXBSource(cd.getFilename());
102 org.exmaralda.partitureditor.jexmaralda.segment.SegmentCountForMetaInformation
// Target of the segmented file: parent URL + file name without ending + "_s.exs".
// NOTE(review): plain string concatenation - this only yields a valid location if
// cd.getParentURL() ends with a path separator; confirm.
105 URL url =
new URL(cd.getParentURL() + cd.getFilenameWithoutFileEnding() +
"_s.exs");
107 stats.
addFix(
function, cd,
"Exs successfully created at " + url);
109 stats.
addCorrect(
function, cd,
"No segmentation errors found with segmentation " + segmentationName);
// Report a segmentation error with the tier and timeline position carried by the exception.
113 FSMException fsme = (FSMException) o;
114 String text = fsme.getMessage();
// NOTE(review): uses the static field filename instead of a value taken from cd -
// confirm that the field is set for the file currently being checked.
116 exmaError.addError(
function, filename, fsme.getTierID(), fsme.getTLI(),
false, text);
// NOTE(review): the method header is not visible in this excerpt; presumably this is the
// body of getIsUsableFor() - confirm.
// Looks up the BasicTranscriptionData class by its fully qualified name.
130 Class cl = Class.forName(
"de.uni_hamburg.corpora.BasicTranscriptionData");
132 }
catch (ClassNotFoundException ex) {
// A failed class lookup is recorded in the report.
133 report.
addException(ex,
"unknown class not found error");
// Stores the given value as the segmentation name that function(CorpusData, Boolean)
// compares against (presumably the body of setSegmentation(String s); header not visible).
139 segmentationName = s;
// Stores the given value as the external FSM path; a non-empty value is later copied to
// segmentation.pathToExternalFSM (presumably the body of setExternalFSM(String s); header not visible).
143 path2ExternalFSM = s;
// NOTE(review): the method header is not visible in this excerpt; presumably this is the
// body of setMetadataInformation(Document partitur) - confirm. Several original lines are
// missing, so some statements below are incomplete.
// Collects transcription metadata, machine tags, referenced media files and per-speaker
// information from the XML document into local maps and sets.
148 Element root = partitur.getRootElement();
// speakers: generated speaker ID -> (key -> value) map for one speaker.
150 HashMap<String, HashMap<String, String>> speakers =
new HashMap<String, HashMap<String, String>>();
151 HashMap<String, String> metadata =
new HashMap<String, String>();
152 Element metaInformation;
154 HashMap<String, String> machineTags =
new HashMap<String, String>();
155 HashSet<String> mediaFiles =
new HashSet<String>();
// Only basic and segmented transcriptions are processed.
156 if ((root.getName().equals(
"basic-transcription"))
157 || (root.getName().equals(
"segmented-transcription"))) {
// Give the root element a generated "CID..." Id when it has none; otherwise reuse the existing one.
158 if (root.getAttributeValue(
"Id") == null) {
159 id =
"CID" +
new GUID().makeID();
160 root.setAttribute(
"Id",
id);
164 id = root.getAttributeValue(
"Id");
// NOTE(review): the getChild(...) results below are dereferenced without a null check -
// confirm the elements are guaranteed to exist in the input documents.
166 metaInformation = root.getChild(
"head").getChild(
168 metadata.put(
"project-name",
169 metaInformation.getChild(
"project-name").getText()
171 metadata.put(
"transcription-name",
172 metaInformation.getChild(
"transcription-name")
174 metadata.put(
"comment", metaInformation.getChild(
"comment")
176 metadata.put(
"transcription-convention", metaInformation
177 .getChild(
"transcription-convention").getText().trim());
// Collect the non-empty "url" attributes of all referenced files.
// NOTE(review): a referenced-file element without a "url" attribute would presumably
// make getAttributeValue return null here - confirm.
178 for (Element e : (List<Element>) metaInformation
179 .getChildren(
"referenced-file")) {
180 if (e.getAttributeValue(
"url").length() > 0) {
181 mediaFiles.add(e.getAttributeValue(
"url"));
// User-defined meta information: attribute names starting with "#" go to machineTags.
184 for (Element e : (List<Element>) metaInformation.getChild(
185 "ud-meta-information").getChildren()) {
186 if ((e.getAttributeValue(
"attribute-name").startsWith(
"#"))) {
// NOTE(review): the check above matches a one-character prefix ("#") but substring(2)
// drops two characters - confirm that the second character is meant to be removed.
187 machineTags.put(e.getAttributeValue(
"attribute-name")
188 .substring(2), e.getText().trim());
// Other attribute names are prefixed with "ud_" (the enclosing call is not visible here).
191 "ud_" + e.getAttributeValue(
"attribute-name"),
195 spkTable = (Element) root.getChild(
"head").getChild(
// One entry per child of the speaker table, keyed by a freshly generated "SID..." ID.
197 for (Element s : (List<Element>) spkTable.getChildren()) {
198 String sid =
"SID" +
new GUID().makeID();
199 speakers.put(sid,
new HashMap<String, String>());
200 speakers.get(sid).put(
"id", s.getAttributeValue(
"id"));
201 speakers.get(sid).put(
"@abbreviation",
202 s.getChildText(
"abbreviation"));
// NOTE(review): every sex value other than "m" is stored as "female" - confirm that no
// other values (e.g. unknown) can occur.
203 speakers.get(sid).put(
205 (s.getChild(
"sex").getAttributeValue(
"value")
206 .equals(
"m") ?
"male" :
"female"));
// NOTE(review): same key and same value as the put a few lines above - looks redundant.
207 speakers.get(sid).put(
"@abbreviation",
208 s.getChildText(
"abbreviation"));
// NOTE(review): the languages used by a speaker are written to the transcription-level
// metadata map, while l1/l2 below go to speakers.get(sid) - confirm this is intended.
210 for (Element ul : (List<Element>) s.getChild(
211 "languages-used").getChildren()) {
213 metadata.put(
"@language-used-" + count,
214 ul.getAttributeValue(
"lang"));
217 for (Element l1e : (List<Element>) s.getChild(
"l1")
219 speakers.get(sid).put(
"@l1" + count,
220 l1e.getAttributeValue(
"lang"));
224 for (Element l2e : (List<Element>) s.getChild(
"l2")
226 speakers.get(sid).put(
"@l2" + count,
227 l2e.getAttributeValue(
"lang"));
// User-defined speaker information is stored under a key built from the trimmed attribute name.
232 for (Element udi : (List<Element>) s.getChild(
233 "ud-speaker-information").getChildren()) {
234 speakers.get(sid).put(
236 + udi.getAttributeValue(
237 "attribute-name").trim(),
// The speaker comment is only stored when it is not empty.
242 if (s.getChild(
"comment").getText().length() > 0) {
243 speakers.get(sid).put(
"comment",
244 s.getChild(
"comment").getText());
// Human-readable description of this check: it reports segmentation problems of exb files
// and, with the fix option, creates segmented exs files for exbs without errors.
257 String description =
"This class checks Exmaralda exb files for segmentation problems, returns the errors in the Report and in the ExmaErrors and if the fix option is specified it creates " 258 +
"segmented exs from the exbs that don't contain errors.";
/**
 * Runs the check on a whole corpus by calling function(CorpusData, Boolean)
 * for every element returned by c.getBasicTranscriptionData() and merging
 * each result into stats.
 *
 * NOTE(review): the rest of the body (including the return statement) is not
 * visible in this excerpt.
 *
 * @param c   the corpus whose basic transcription files are checked
 * @param fix passed through unchanged to function(CorpusData, Boolean)
 * @return presumably the merged report (stats) - confirm
 */
263 public Report function(
Corpus c, Boolean fix)
throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException, JexmaraldaException, FSMException {
265 for (
CorpusData cdata : c.getBasicTranscriptionData()) {
266 stats.
merge(
function(cdata, fix));
void setExternalFSM(String s)
BasicTranscription getEXMARaLDAbt()
void setSegmentation(String s)
static ExmaErrorList exmaError
void addCritical(String description)
Collection< Class<?extends CorpusData > > getIsUsableFor()
void addCorrect(String statId, String description)
static org.jdom.Document String2JdomDocument(String stringRespresentingDocument)
void addException(Throwable e, String description)
Document setMetadataInformation(Document partitur)
void write(CorpusData cd, URL url)
void addFix(String statId, CorpusData cd, String description)