corpus-services  1.0
ExbSegmentationChecker.java
Go to the documentation of this file.
1 
10 package de.uni_hamburg.corpora.validation;
11 
20 import java.io.IOException;
21 import java.io.File;
22 import java.io.UnsupportedEncodingException;
23 import java.net.URISyntaxException;
24 import java.net.URL;
25 import java.util.Collection;
26 import java.util.List;
27 import java.util.HashMap;
28 import java.util.HashSet;
29 import java.util.logging.Level;
30 import java.util.logging.Logger;
31 import javax.xml.parsers.ParserConfigurationException;
32 import javax.xml.transform.TransformerException;
33 import javax.xml.xpath.XPathExpressionException;
34 
35 import org.xml.sax.SAXException;
36 
37 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
38 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
39 import org.exmaralda.partitureditor.jexmaralda.segment.AbstractSegmentation;
40 import org.exmaralda.partitureditor.fsm.FSMException;
41 import org.exmaralda.partitureditor.jexmaralda.SegmentedTranscription;
42 import org.jdom.Document;
43 import org.jdom.Element;
44 import org.jdom.JDOMException;
45 
46 import org.exmaralda.coma.helpers.*;
47 
53 public class ExbSegmentationChecker extends Checker implements CorpusFunction {
54 
// Static String that function(CorpusData, Boolean) passes to exmaError.addError as the file argument.
// NOTE(review): not assigned anywhere in this class as shown, so it is null unless set from outside - confirm.
55  static String filename;
// NOTE(review): not referenced anywhere else in this class as shown.
56  static BasicTranscription bt;
// Transcription data of the file being checked; reassigned on every call of function(CorpusData, Boolean).
// Static, so the value is shared by all instances of this checker.
57  static BasicTranscriptionData btd;
// NOTE(review): not referenced anywhere else in this class as shown.
58  static File exbfile;
// Segmentation implementation; instantiated in function(CorpusData, Boolean) according to segmentationName.
59  AbstractSegmentation segmentation;
// NOTE(review): not referenced anywhere else in this class as shown.
60  static ValidatorSettings settings;
// Name of the segmentation to use; changed via setSegmentation(String).
// Any value not matched in function(CorpusData, Boolean) selects GenericSegmentation.
61  String segmentationName = "GENERIC";
// Path to an external FSM; the empty string means none is handed to the segmentation.
// Changed via setExternalFSM(String).
62  String path2ExternalFSM = "";
63 
65  //fixing is possible
66  super(true);
67  }
68 
69  @Override
70  public Report function(CorpusData cd, Boolean fix) throws SAXException, JDOMException, IOException, JexmaraldaException, FSMException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, XPathExpressionException, URISyntaxException {
71  Report stats = new Report();
72 
73  btd = new BasicTranscriptionData(cd.getURL());
74  if (segmentationName.equals("HIAT")) {
75  segmentation = new org.exmaralda.partitureditor.jexmaralda.segment.HIATSegmentation();
76  } else if (segmentationName.equals("GAT")) {
77  segmentation = new org.exmaralda.partitureditor.jexmaralda.segment.GATSegmentation();
78  } else if (segmentationName.equals("cGAT_MINIMAL")) {
79  segmentation = new org.exmaralda.partitureditor.jexmaralda.segment.cGATMinimalSegmentation();
80  } else if (segmentationName.equals("CHAT")) {
81  segmentation = new org.exmaralda.partitureditor.jexmaralda.segment.CHATSegmentation();
82  } else if (segmentationName.equals("CHAT_MINIMAL")) {
83  segmentation = new org.exmaralda.partitureditor.jexmaralda.segment.CHATMinimalSegmentation();
84  } else if (segmentationName.equals("DIDA")) {
85  segmentation = new org.exmaralda.partitureditor.jexmaralda.segment.DIDASegmentation();
86  } else if (segmentationName.equals("IPA")) {
87  segmentation = new org.exmaralda.partitureditor.jexmaralda.segment.IPASegmentation();
88  } else {
89  segmentation = new org.exmaralda.partitureditor.jexmaralda.segment.GenericSegmentation();
90  }
91  if (!path2ExternalFSM.equals("")) {
92  segmentation.pathToExternalFSM = path2ExternalFSM;
93  }
94  CorpusIO cio = new CorpusIO();
95  List v = segmentation.getSegmentationErrors(btd.getEXMARaLDAbt());
96  if (v.isEmpty()) {
97  if (fix){
98  SegmentedTranscription st = segmentation.BasicToSegmented(btd.getEXMARaLDAbt());
99  st.setEXBSource(cd.getFilename());
100  //add the udMetadata!!!!
101  //finally found the missing method :) :)
102  org.exmaralda.partitureditor.jexmaralda.segment.SegmentCountForMetaInformation
103  .count(st);
104  Document doc = TypeConverter.String2JdomDocument(st.toXML());
105  URL url = new URL(cd.getParentURL() + cd.getFilenameWithoutFileEnding() + "_s.exs");
106  cio.write(doc, url);
107  stats.addFix(function, cd, "Exs successfully created at " + url);
108  } else{
109  stats.addCorrect(function, cd, "No segmentation errors found with segmentation " + segmentationName);
110  }
111  } else {
112  for (Object o : v) {
113  FSMException fsme = (FSMException) o;
114  String text = fsme.getMessage();
115  stats.addCritical(function, cd, text);
116  exmaError.addError(function, filename, fsme.getTierID(), fsme.getTLI(), false, text);
117  }
118  }
119  return stats;
120  }
121 
127  @Override
128  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
129  try {
130  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
131  IsUsableFor.add(cl);
132  } catch (ClassNotFoundException ex) {
133  report.addException(ex, "unknown class not found error");
134  }
135  return IsUsableFor;
136  }
137 
138  public void setSegmentation(String s) {
139  segmentationName = s;
140  }
141 
142  public void setExternalFSM(String s) {
143  path2ExternalFSM = s;
144  }
145 
146  public Document setMetadataInformation(Document partitur) {
147  //SEE org.exmaralda.coma.models.TranscriptionMetadata
148  Element root = partitur.getRootElement();
149  String id = "";
150  HashMap<String, HashMap<String, String>> speakers = new HashMap<String, HashMap<String, String>>();
151  HashMap<String, String> metadata = new HashMap<String, String>();
152  Element metaInformation;
153  Element spkTable;
154  HashMap<String, String> machineTags = new HashMap<String, String>();
155  HashSet<String> mediaFiles = new HashSet<String>();
156  if ((root.getName().equals("basic-transcription"))
157  || (root.getName().equals("segmented-transcription"))) {
158  if (root.getAttributeValue("Id") == null) {
159  id = "CID" + new GUID().makeID();
160  root.setAttribute("Id", id);
161  // attribut machen
162  // speichern
163  } else {
164  id = root.getAttributeValue("Id");
165  }
166  metaInformation = root.getChild("head").getChild(
167  "meta-information");
168  metadata.put("project-name",
169  metaInformation.getChild("project-name").getText()
170  .trim());
171  metadata.put("transcription-name",
172  metaInformation.getChild("transcription-name")
173  .getText());
174  metadata.put("comment", metaInformation.getChild("comment")
175  .getText());
176  metadata.put("transcription-convention", metaInformation
177  .getChild("transcription-convention").getText().trim());
178  for (Element e : (List<Element>) metaInformation
179  .getChildren("referenced-file")) {
180  if (e.getAttributeValue("url").length() > 0) {
181  mediaFiles.add(e.getAttributeValue("url"));
182  }
183  }
184  for (Element e : (List<Element>) metaInformation.getChild(
185  "ud-meta-information").getChildren()) {
186  if ((e.getAttributeValue("attribute-name").startsWith("#"))) {
187  machineTags.put(e.getAttributeValue("attribute-name")
188  .substring(2), e.getText().trim());
189  } else {
190  metadata.put(
191  "ud_" + e.getAttributeValue("attribute-name"),
192  e.getText().trim());
193  }
194  }
195  spkTable = (Element) root.getChild("head").getChild(
196  "speakertable");
197  for (Element s : (List<Element>) spkTable.getChildren()) {
198  String sid = "SID" + new GUID().makeID();
199  speakers.put(sid, new HashMap<String, String>());
200  speakers.get(sid).put("id", s.getAttributeValue("id"));
201  speakers.get(sid).put("@abbreviation",
202  s.getChildText("abbreviation"));
203  speakers.get(sid).put(
204  "@sex",
205  (s.getChild("sex").getAttributeValue("value")
206  .equals("m") ? "male" : "female"));
207  speakers.get(sid).put("@abbreviation",
208  s.getChildText("abbreviation"));
209  int count = 0;
210  for (Element ul : (List<Element>) s.getChild(
211  "languages-used").getChildren()) {
212  count++;
213  metadata.put("@language-used-" + count,
214  ul.getAttributeValue("lang"));
215  }
216  count = 0;
217  for (Element l1e : (List<Element>) s.getChild("l1")
218  .getChildren()) {
219  speakers.get(sid).put("@l1" + count,
220  l1e.getAttributeValue("lang"));
221  count++;
222  }
223  count = 0;
224  for (Element l2e : (List<Element>) s.getChild("l2")
225  .getChildren()) {
226  speakers.get(sid).put("@l2" + count,
227  l2e.getAttributeValue("lang"));
228 
229  count++;
230  }
231 
232  for (Element udi : (List<Element>) s.getChild(
233  "ud-speaker-information").getChildren()) {
234  speakers.get(sid).put(
235  "ud_"
236  + udi.getAttributeValue(
237  "attribute-name").trim(),
238  udi.getText());
239 
240  // }
241  }
242  if (s.getChild("comment").getText().length() > 0) {
243  speakers.get(sid).put("comment",
244  s.getChild("comment").getText());
245  }
246  }
247  }
248  return partitur;
249  }
250 
255  @Override
256  public String getDescription() {
257  String description = "This class checks Exmaralda exb files for segmentation problems, returns the errors in the Report and in the ExmaErrors and if the fix option is specified it creates "
258  + "segmented exs from the exbs that don't contain errors.";
259  return description;
260  }
261 
262  @Override
263  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException, JexmaraldaException, FSMException {
264  Report stats = new Report();
265  for (CorpusData cdata : c.getBasicTranscriptionData()) {
266  stats.merge(function(cdata, fix));
267  }
268  return stats;
269  }
270 }
void merge(Report sr)
Definition: Report.java:73
void addCritical(String description)
Definition: Report.java:104
Collection< Class<?extends CorpusData > > getIsUsableFor()
void addCorrect(String statId, String description)
Definition: Report.java:217
static org.jdom.Document String2JdomDocument(String stringRespresentingDocument)
void addException(Throwable e, String description)
Definition: Report.java:287
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:66
void addFix(String statId, CorpusData cd, String description)
Definition: Report.java:155