hzsk-corpus-services  1.0
EXB2INELISOTEI.java
Go to the documentation of this file.
1 /*
2  * To change this license header, choose License Headers in Project Properties.
3  * To change this template file, choose Tools | Templates
4  * and open the template in the editor.
5  */
6 package de.uni_hamburg.corpora.conversion;
7 
15 import java.io.IOException;
16 import java.util.HashSet;
17 import java.util.Hashtable;
18 import java.util.List;
19 import java.util.Vector;
20 //TODO get rid of emxa imports in the future
21 import org.exmaralda.common.jdomutilities.IOUtilities;
22 import org.exmaralda.partitureditor.fsm.FSMException;
23 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
24 import org.exmaralda.partitureditor.jexmaralda.SegmentedTranscription;
26 import java.io.InputStream;
27 import java.net.MalformedURLException;
28 import java.net.URL;
29 import java.nio.file.Paths;
30 import org.exmaralda.partitureditor.jexmaralda.segment.HIATSegmentation;
31 import org.jdom.Attribute;
32 import org.jdom.Document;
33 import org.jdom.Element;
34 import org.jdom.JDOMException;
35 import org.jdom.Namespace;
36 import org.jdom.Text;
37 import org.jdom.xpath.XPath;
38 import org.xml.sax.SAXException;
39 import java.util.*;
40 import javax.xml.parsers.ParserConfigurationException;
41 import javax.xml.transform.TransformerException;
42 import javax.xml.xpath.XPathExpressionException;
43 import org.exmaralda.common.corpusbuild.TextFilter;
44 
49 public class EXB2INELISOTEI extends Converter implements CorpusFunction {
50 
51  //copied partly from exmaralda\src\org\exmaralda\partitureditor\jexmaralda\convert\TEIConverter.java
52  //TODO - how to get the language for INEL?
 //language code written to the xml:lang attribute of the TEI text element (see setDocLanguage);
 //can be changed with setLanguage
53  String language = "en";
54 
 //identifier handed to the report with every entry this converter adds
55  final String function = "inel iso tei";
56 
57  //locations of the XSL stylesheets and the segmentation FSM; they are read as internal resources
58  static String TEI_SKELETON_STYLESHEET_ISO = "/xsl/EXMARaLDA2ISOTEI_Skeleton.xsl";
59  static String SC_TO_TEI_U_STYLESHEET_ISO = "/xsl/SegmentChain2ISOTEIUtteranceINEL.xsl";
60  static String SORT_AND_CLEAN_STYLESHEET_ISO = "/xsl/ISOTEICleanAndSortINEL.xsl";
61  static String FSM = "/xsl/INEL_Segmentation_FSM.xml";
62 
 //XPath used to find the node that receives the merged utterances;
 //NOTE: SegmentedTranscriptionToTEITranscription overwrites this static with "//tei:body" on every call
63  static String BODY_NODE = "//text";
64 
65  //the default tier where the morpheme segmentation is located
66  String XPath2Morphemes = "/basic-transcription/basic-body/tier[@id = \"mb\"]";
67  //names of the deep and the flat segmentation that are merged into the TEI utterances
68  String nameOfDeepSegmentation = "SpeakerContribution_Utterance_Word";
 //(sic) the field name is misspelled but is referenced elsewhere in this class, so it is kept
69  String nameOfFlategmentation = "SpeakerContribution_Event";
70 
71  //transformers for three transformations
 //NOTE(review): none of the three fields is referenced in the visible part of this class - confirm before removing
72  XSLTransformer transformer;
73  XSLTransformer transformer2;
74  XSLTransformer transformer3;
75 
 //helper used for reading internal resources and for writing the results
76  CorpusIO cio = new CorpusIO();
77 
/** Creates the converter and passes the name "EXB2INELISOTEI" to the superclass constructor. */
78  public EXB2INELISOTEI() {
79  super("EXB2INELISOTEI");
80  }
81 
82  /*
83  * This method takes a CorpusData object and converts it into INEL morpheme ISO TEI by delegating
84  * to convertCD2MORPHEMEHIATISOTEI with the full text included and the default morpheme tier XPath
85  * (field XPath2Morphemes), and gives back the report returned by that conversion.
86  */
87  public Report function(CorpusData cd) {
88  return convertCD2MORPHEMEHIATISOTEI(cd, true, XPath2Morphemes);
89  }
90 
91  /*
92  * This method takes a CorpusData object, a flag stating whether the full text is included,
93  * and an XPath string locating the tier that holds the morpheme segmentation,
94  * converts the data into ISO TEI and saves the result next to cd with the suffix _tei.xml,
95  * and gives back a report stating whether the conversion worked or which errors occurred.
96  */
98  boolean includeFullText, String XPath2Morphemes) {
99  try {
100  //we create a BasicTranscription form the CorpusData
102  BasicTranscription bt = btd.getEXMARaLDAbt();
103  //normalize the exb (!)
104  bt.normalize();
105  System.out.println((cd.getURL()).getFile());
106  System.out.println("started writing document...");
107  //HIAT Segmentation
108  //TODO need to be a parameter in the future
109  //we need to give it the path to the custom INEL fsm for the segmentation
110  HIATSegmentation segmentation;
111  if (FSM!=null){
112  //reading the FSM and writing it to TEMP folder because Exmaralda Segmentation only takes an external path
113  InputStream is = getClass().getResourceAsStream(FSM);
114  String fsmstring = TypeConverter.InputStream2String(is);
115  URL url = Paths.get(System.getProperty("java.io.tmpdir")+ "/" + "fsmstring.xml").toUri().toURL();
116  cio.write(fsmstring, url);
117  segmentation = new HIATSegmentation(url.getFile());
118  }
119  else {
120  //default HIAT segmentation
121  segmentation = new HIATSegmentation();
122  }
123 
124  //create a segmented exs
125  SegmentedTranscription st = segmentation.BasicToSegmented(bt);
126  System.out.println("Segmented transcription created");
127  //Document from segmented transcription string
128  Document stdoc = TypeConverter.String2JdomDocument(st.toXML());
129  //TODO paramter in the future for deep & flat segmentation name
130  //MAGIC - now the real work happens
131  Document teiDoc = SegmentedTranscriptionToTEITranscription(stdoc,
132  nameOfDeepSegmentation,
133  nameOfFlategmentation,
134  includeFullText, cd);
135  if (teiDoc != null) {
136  System.out.println("Merged");
137  //so is the language of the doc
138  setDocLanguage(teiDoc, language);
139  //now the completed document is saved next to cd
140  String filename = cd.getURL().getFile();
141  URL url = new URL("file://" + filename.substring(0, filename.lastIndexOf(".")) + "_tei.xml");
142  cio.write(teiDoc, url);
143 
144  System.out.println("document written.");
145  report.addCorrect(function, cd, "ISO TEI conversion of file was successful");
146  } else {
147  report.addCritical(function, cd, "ISO TEI conversion of file was not possible because of unknown error");
148  }
149 
150  } catch (SAXException ex) {
151  report.addException(ex, function, cd, "Unknown exception error");
152  } catch (FSMException ex) {
153  report.addException(ex, function, cd, "Unknown finite state machine error");
154  } catch (MalformedURLException ex) {
155  report.addException(ex, function, cd, "Unknown file URL reading error");
156  } catch (JDOMException ex) {
157  report.addException(ex, function, cd, "Unknown file reading error");
158  } catch (IOException ex) {
159  report.addException(ex, function, cd, "Unknown file reading error");
160  } catch (TransformerException ex) {
161  report.addException(ex, function, cd, "XSL transformer error");
162  } catch (ParserConfigurationException ex) {
163  report.addException(ex, function, cd, "Parser error");
164  } catch (XPathExpressionException ex) {
165  report.addException(ex, function, cd, "XPath error");
166  }
167  return report;
168  }
169 
170  public Document SegmentedTranscriptionToTEITranscription(Document segmentedTranscription,
171  String nameOfDeepSegmentation,
172  String nameOfFlatSegmentation,
173  boolean includeFullText, CorpusData cd) throws JDOMException, IOException, TransformerException {
174 
175  Document finalDocument = null;
176  String skeleton_stylesheet = cio.readInternalResourceAsString(TEI_SKELETON_STYLESHEET_ISO);
177 
178  String transform_stylesheet = cio.readInternalResourceAsString(SC_TO_TEI_U_STYLESHEET_ISO);
179 
180  String sort_and_clean_stylesheet = cio.readInternalResourceAsString(SORT_AND_CLEAN_STYLESHEET_ISO);
181 
182  Document teiDocument = null;
183 
184  XSLTransformer xslt = new XSLTransformer();
185  //transform wants an xml as string object and xsl as String Object
186  //System.out.println(skeleton_stylesheet);
187  String result
188  = xslt.transform(TypeConverter.JdomDocument2String(segmentedTranscription), skeleton_stylesheet);
189  if (result != null) {
190  //now we get a document of the first transformation, the iso tei skeleton
191  teiDocument = TypeConverter.String2JdomDocument(result);
192  System.out.println("STEP 1 completed.");
193  /*
194  * this method will take the segmented transcription and, for each speaker
195  * contribution in the segmentation with the name 'nameOfDeepSegmentation'
196  * will add anchors from the segmentation with the name
197  * 'nameOfFlatSegmentation' such that the temporal information provided in
198  * the flat segmentation is completely represented as anchors within the
199  * deep segmentation. The typical application scenario is to give this
200  * method a segmented HIAT transcription with nameOfDeepSegmentation =
201  * 'SpeakerContribution_Utterance_Word' nameOfFlatSegmentation =
202  * 'SpeakerContribution_Event'
203  */
204  //We would also like to keep the FlatSegmentation as an annotation to display it correctly
205  //TO DO
206  Vector uElements = TEIMerge(segmentedTranscription, nameOfDeepSegmentation, nameOfFlatSegmentation, includeFullText);
207 
208  XPath xp = XPath.newInstance(BODY_NODE);
209  BODY_NODE = "//tei:body";
210  xp = XPath.newInstance(BODY_NODE);
211  xp.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
212 
213  Element textNode = (Element) (xp.selectSingleNode(teiDocument));
214  textNode.addContent(uElements);
215  if (teiDocument != null) {
216  System.out.println("STEP 2 completed.");
217 
218  Document transformedDocument = null;
219  String result2
220  = xslt.transform(TypeConverter.JdomDocument2String(teiDocument), transform_stylesheet);
221  transformedDocument = IOUtilities.readDocumentFromString(result2);
222  if (transformedDocument != null) {
223  //fix for issue #89
224  textNode = (Element) (xp.selectSingleNode(transformedDocument));
225  System.out.println("STEP 3 completed.");
226  // now take care of the events from tiers of type 'd'
227  XPath xp2 = XPath.newInstance("//segmentation[@name='Event']/ats");
228  List events = xp2.selectNodes(segmentedTranscription);
229  for (int pos = 0; pos < events.size(); pos++) {
230  Element exmaraldaEvent = (Element) (events.get(pos));
231  String category = exmaraldaEvent.getParentElement().getParentElement().getAttributeValue("category");
232 
233  String elementName = "event";
234  if (category.equals("pause")) {
235  elementName = "pause";
236  }
237 
238  Element teiEvent = new Element(elementName);
239 
240  String speakerID = exmaraldaEvent.getParentElement().getParentElement().getAttributeValue("speaker");
241  if (speakerID != null) {
242  teiEvent.setAttribute("who", speakerID);
243  }
244  teiEvent.setAttribute("start", exmaraldaEvent.getAttributeValue("s"));
245  teiEvent.setAttribute("end", exmaraldaEvent.getAttributeValue("e"));
246  if (!category.equals("pause")) {
247  teiEvent.setAttribute("desc", exmaraldaEvent.getText());
248  teiEvent.setAttribute("type", category);
249  } else {
250  String duration = exmaraldaEvent.getText().replaceAll("\\(", "").replaceAll("\\)", "");
251  teiEvent.setAttribute("dur", duration);
252  }
253  textNode.addContent(teiEvent);
254  }
255 
256  //for morpheme inel iso tei, sort and clean must be changed
257  //and the generating of the ids
258  generateWordIDs(transformedDocument);
259  if (transformedDocument != null) {
260  //Here the annotations are taken care of
261  //this is important for the INEL morpheme segmentations
262  //for the INEL transformation, the word IDs are generated earlier
263  String result3
264  = xslt.transform(TypeConverter.JdomDocument2String(transformedDocument), sort_and_clean_stylesheet);
265  if (result3 != null) {
266  finalDocument = IOUtilities.readDocumentFromString(result3);
267  if (finalDocument != null) {
268  }
269  }
270  }
271  }
272  }
273  }
274 
275  return finalDocument;
276  }
277 
/**
 * Convenience variant of the four-argument TEIMerge that does not add a full-text annotation
 * (it passes false for includeFullText) and returns that method's result unchanged.
 */
278  public static Vector TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation) {
279  return TEIMerge(segmentedTranscription, nameOfDeepSegmentation, nameOfFlatSegmentation, false);
280  }
281 
300  public static Vector TEIMerge(Document segmentedTranscription,
301  String nameOfDeepSegmentation,
302  String nameOfFlatSegmentation,
303  boolean includeFullText) {
304  try {
305 
306  // Make a map of the timeline
307  Hashtable timelineItems = new Hashtable();
308  String xpath = "//tli";
309  XPath xpx = XPath.newInstance(xpath);
310  List tlis = xpx.selectNodes(segmentedTranscription);
311  for (int pos = 0; pos < tlis.size(); pos++) {
312 
313  timelineItems.put(((Element) (tlis.get(pos))).getAttributeValue("id"), pos);
314  }
315 
316  Vector returnValue = new Vector();
317  XPath xp1 = XPath.newInstance("//segmentation[@name='" + nameOfDeepSegmentation + "']/ts");
318  List segmentChains = xp1.selectNodes(segmentedTranscription);
319  // go through all top level segment chains
320  for (Object segmentChain : segmentChains) {
321  Element sc = (Element) (segmentChain);
322  sc.setAttribute("speaker", sc.getParentElement().getParentElement().getAttributeValue("speaker"));
323  String tierref = sc.getParentElement().getAttributeValue("tierref");
324  String start = sc.getAttributeValue("s");
325  String end = sc.getAttributeValue("e");
326  String xpath2 = "//segmentation[@name='" + nameOfFlatSegmentation + "' and @tierref='" + tierref + "']"
327  + "/ts[@s='" + start + "' and @e='" + end + "']";
328  XPath xp2 = XPath.newInstance(xpath2);
329  Element sc2 = (Element) (xp2.selectSingleNode(segmentedTranscription));
330  if (sc2 == null) {
331  //this means that no corresponding top level
332  //element was found in the second segmentation
333  //which should not happen
334  throw new Exception(tierref + " " + start + " " + end);
335  }
336  // this is where the magic happens
337  Element mergedElement = merge(sc, sc2);
338  int s = ((Integer) (timelineItems.get(start)));
339  int e = ((Integer) (timelineItems.get(end)));
340  //We would also like to keep the FlatSegmentation as an annotation to display it correctly
341  //TO DO
342  String xpath3 = "//segmentation[@name='" + nameOfFlatSegmentation + "' and @tierref='" + tierref + "']"
343  + "/ts[@s='" + start + "' and @e='" + end + "']/ts";
344  XPath xp3 = XPath.newInstance(xpath3);
345  List transannos = xp3.selectNodes(segmentedTranscription);
346  for (Object transanno1 : transannos) {
347  Element transanno = (Element) transanno1;
348  String transaStart = transanno.getAttributeValue("s");
349  String transaEnd = transanno.getAttributeValue("e");
350  int transas = ((Integer) (timelineItems.get(transaStart)));
351  int transae = ((Integer) (timelineItems.get(transaEnd)));
352  boolean transannotationBelongsToThisElement = (transas >= s && transas <= e) || (transae >= s && transae <= e);
353  if (transannotationBelongsToThisElement) {
354  Element annotationsElement = mergedElement.getChild("annotations");
355  if (annotationsElement == null) {
356  annotationsElement = new Element("annotations");
357  mergedElement.addContent(annotationsElement);
358  }
359  Element annotation = new Element("annotation");
360  annotation.setAttribute("start", transaStart);
361  annotation.setAttribute("end", transaEnd);
362  annotation.setAttribute("level", transanno.getParentElement().getParentElement().getAttributeValue("name"));
363  annotation.setAttribute("value", transanno.getText());
364  annotationsElement.addContent(annotation);
365  }
366  }
367  // now take care of the corresponding annotations
368  String xpath5 = "//segmented-tier[@id='" + tierref + "']/annotation/ta";
369  XPath xp5 = XPath.newInstance(xpath5);
370  List annotations = xp5.selectNodes(segmentedTranscription);
371  for (Object annotation1 : annotations) {
372  Element anno = (Element) (annotation1);
373  String aStart = anno.getAttributeValue("s");
374  String aEnd = anno.getAttributeValue("e");
375  int as = ((Integer) (timelineItems.get(aStart)));
376  int ae = ((Integer) (timelineItems.get(aEnd)));
377  boolean annotationBelongsToThisElement = (as >= s && as <= e) || (ae >= s && ae <= e);
378  if (annotationBelongsToThisElement) {
379  Element annotationsElement = mergedElement.getChild("annotations");
380  if (annotationsElement == null) {
381  annotationsElement = new Element("annotations");
382  mergedElement.addContent(annotationsElement);
383  }
384  Element annotation = new Element("annotation");
385  annotation.setAttribute("start", aStart);
386  annotation.setAttribute("end", aEnd);
387  annotation.setAttribute("level", anno.getParentElement().getAttributeValue("name"));
388  annotation.setAttribute("value", anno.getText());
389  annotationsElement.addContent(annotation);
390  }
391 
392  //System.out.println(s + "/" + e + " **** " + as + "/" + ae);
393  }
394 
395  //*****************************************
396  // NEW 25-04-2016
397  // include full text if Daniel J. wisheth thus
398  if (includeFullText) {
399  Element annotation = new Element("annotation");
400  annotation.setAttribute("start", start);
401  annotation.setAttribute("end", end);
402  annotation.setAttribute("level", "full-text");
403 
404  String fullText = "";
405  List l = XPath.selectNodes(sc2, "descendant::text()");
406  for (Object o : l) {
407  Text text = (Text) o;
408  fullText += text.getText();
409  }
410  annotation.setAttribute("value", fullText);
411 
412  Element annotationsElement = mergedElement.getChild("annotations");
413  if (annotationsElement == null) {
414  annotationsElement = new Element("annotations");
415  mergedElement.addContent(annotationsElement);
416  }
417  annotationsElement.addContent(annotation);
418  }
419  //*****************************************
420 
421  returnValue.addElement(mergedElement.detach());
422  }
423 
424  // issue #89 - Now the vector contains elements only from the
425  // segmentations passed as parameters
426  // in particular, it seems that tiers of type 'd' (which end up as
427  // segmentation @name='Event' are lost
428  return returnValue;
429  } catch (JDOMException ex) {
430  ex.printStackTrace();
431  } catch (Exception ex) {
432  ex.printStackTrace();
433  }
434  return null;
435  }
436 
/**
 * Inserts anchor elements into e1 at the character positions where the texts of e2 end,
 * and returns e1.
 *
 * For every text node of e2 except the last one an anchor is created whose synch attribute is
 * the e attribute of the text's parent element. The lengths of the e2 texts are summed up to a
 * character boundary, and the text nodes of e1 are walked until that boundary is reached.
 * e1 is modified in place.
 */
437  static Element merge(Element e1, Element e2) {
438 
 // all descendants of e1 (elements and texts) in iteration order
439  Iterator i1 = e1.getDescendants();
440  Vector pcData1 = new Vector();
441  while (i1.hasNext()) {
442  pcData1.addElement(i1.next());
443  }
444 
 // the descendants of e2 that pass the TextFilter; they are cast to Text below
445  Iterator i2 = e2.getDescendants(new TextFilter());
446  Vector pcData2 = new Vector();
447  while (i2.hasNext()) {
448  pcData2.addElement(i2.next());
449  }
450 
 // running total of the characters of the e2 texts seen so far
451  int charBoundary = 0;
 // the last text of e2 is skipped, so no anchor is created for its end
452  for (int pos = 0; pos < pcData2.size() - 1; pos++) {
453  Text eventText = (Text) (pcData2.elementAt(pos));
454  Element anchor = new Element("anchor");
455  Element event = eventText.getParentElement();
 // despite its name, this variable holds the end ('e') of the event
456  String start = event.getAttributeValue("e");
457  anchor.setAttribute("synch", start);
458 
459  charBoundary += eventText.getText().length();
460  // now walk through the other tree and insert the corresponding anchor
461  // at the right position
462  int charCount = 0;
463  for (int pos2 = 0; pos2 < pcData1.size(); pos2++) {
464  Object o = pcData1.elementAt(pos2);
465  if (!(o instanceof Text)) {
466  continue;
467  }
468  Text segmentText = (Text) o;
469  int textLength = segmentText.getText().length();
470  if (charCount + textLength < charBoundary) {
 // boundary lies behind this text: count its characters and go on
471  charCount += textLength;
472  continue;
473  } else if (charCount + textLength == charBoundary) {
 // boundary coincides with the end of this text: the anchor goes directly
 // behind the text's parent element (the local 'index' is computed but not used)
474  Element parent = segmentText.getParentElement();
475  int index = parent.indexOf(segmentText);
476  Element parentOfParent = parent.getParentElement();
477  int index2 = parentOfParent.indexOf(parent);
478  parentOfParent.addContent(index2 + 1, anchor);
479  break;
480  }
481  // charCount+textLength>charBoundary
 // boundary lies inside this text: split it and put the anchor between the parts
482  String leftPart = segmentText.getText().substring(0, charBoundary - charCount);
483  String rightPart = segmentText.getText().substring(charBoundary - charCount);
484  Text leftText = new Text(leftPart);
485  Text rightText = new Text(rightPart);
486 
487  // new items have to be inserted twice - once
488  // into the Vector, once into the parent
489  // items in the Vector must get the right parent
 // NOTE(review): the three nodes are appended at the end of the parent, which keeps the
 // order only if segmentText was the parent's last child - confirm
490  Element parent = segmentText.getParentElement();
491  parent.removeContent(segmentText);
492  parent.addContent(leftText);
493  parent.addContent(anchor);
494  parent.addContent(rightText);
495 
 // inserting three times at pos2 yields the order leftText, anchor, rightText
496  pcData1.remove(segmentText);
497  pcData1.add(pos2, rightText);
498  pcData1.add(pos2, anchor);
499  pcData1.add(pos2, leftText);
500  break;
501  }
502  }
503 
504  return e1;
505  }
506 
507  //new 30-03-2016
508  //this needed to be adapted to morpheme ids - and changed for the word IDs too
509  //and we need to generate the spans for the morphemes somewhere too
510  private void generateWordIDs(Document document) throws JDOMException {
511  // added 30-03-2016
512  HashSet<String> allExistingIDs = new HashSet<String>();
513  XPath idXPath = XPath.newInstance("//tei:*[@xml:id]");
514  idXPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
515  idXPath.addNamespace(Namespace.XML_NAMESPACE);
516  List idElements = idXPath.selectNodes(document);
517  for (Object o : idElements) {
518  Element e = (Element) o;
519  allExistingIDs.add(e.getAttributeValue("id", Namespace.XML_NAMESPACE));
520  }
521 
522  // changed 30-03-2016
523  XPath wordXPath = XPath.newInstance("//tei:w[not(@xml:id)]");
524  wordXPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
525  wordXPath.addNamespace(Namespace.XML_NAMESPACE);
526 
527  List words = wordXPath.selectNodes(document);
528  int count = 1;
529  for (Object o : words) {
530  Element word = (Element) o;
531  while (allExistingIDs.contains("w" + Integer.toString(count))) {
532  count++;
533  }
534 
535  String wordID = "w" + Integer.toString(count);
536  allExistingIDs.add(wordID);
537  //System.out.println("*** " + wordID);
538  word.setAttribute("id", wordID, Namespace.XML_NAMESPACE);
539  }
540 
541  // new 02-12-2014
542  XPath pcXPath = XPath.newInstance("//tei:pc[not(@xml:id)]");
543  pcXPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
544  pcXPath.addNamespace(Namespace.XML_NAMESPACE);
545 
546  List pcs = pcXPath.selectNodes(document);
547  count = 1;
548  for (Object o : pcs) {
549  Element pc = (Element) o;
550  while (allExistingIDs.contains("pc" + Integer.toString(count))) {
551  count++;
552  }
553 
554  String pcID = "pc" + Integer.toString(count);
555  allExistingIDs.add(pcID);
556  //System.out.println("*** " + wordID);
557  pc.setAttribute("id", pcID, Namespace.XML_NAMESPACE);
558  }
559 
560  // we also need this for events/incidents
561  XPath incXPath = XPath.newInstance("//tei:event[not(@xml:id)]");
562  pcXPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
563  pcXPath.addNamespace(Namespace.XML_NAMESPACE);
564 
565  List incs = incXPath.selectNodes(document);
566  count = 1;
567  for (Object o : incs) {
568  Element pc = (Element) o;
569  while (allExistingIDs.contains("inc" + Integer.toString(count))) {
570  count++;
571  }
572 
573  String incID = "inc" + Integer.toString(count);
574  allExistingIDs.add(incID);
575  //System.out.println("*** " + wordID);
576  pc.setAttribute("id", incID, Namespace.XML_NAMESPACE);
577  }
578 
579  // we also need this for seg elements
580  XPath segXPath = XPath.newInstance("//tei:seg[not(@xml:id)]");
581  pcXPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
582  pcXPath.addNamespace(Namespace.XML_NAMESPACE);
583 
584  List segs = segXPath.selectNodes(document);
585  count = 1;
586  for (Object o : segs) {
587  Element seg = (Element) o;
588  while (allExistingIDs.contains("seg" + Integer.toString(count))) {
589  count++;
590  }
591 
592  String segID = "seg" + Integer.toString(count);
593  allExistingIDs.add(segID);
594  //System.out.println("*** " + wordID);
595  seg.setAttribute("id", segID, Namespace.XML_NAMESPACE);
596  }
597  }
598 
599  //TODO
600  private void setDocLanguage(Document teiDoc, String language) throws JDOMException {
601  // /TEI/text[1]/@*[namespace-uri()='http://www.w3.org/XML/1998/namespace' and local-name()='lang']
602  XPath xpathToLangAttribute = XPath.newInstance("//tei:text/@xml:lang");
603  xpathToLangAttribute.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
604  xpathToLangAttribute.addNamespace(Namespace.XML_NAMESPACE);
605  Attribute langAtt = (Attribute) xpathToLangAttribute.selectSingleNode(teiDoc);
606  if (langAtt != null) {
607  langAtt.setValue(language);
608  } else {
609  XPath xpathToTextElement = XPath.newInstance("//tei:text");
610  xpathToTextElement.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
611  xpathToTextElement.addNamespace(Namespace.XML_NAMESPACE);
612  Element textEl = (Element) xpathToTextElement.selectSingleNode(teiDoc);
613  textEl.setAttribute("lang", language, Namespace.XML_NAMESPACE);
614  }
615  System.out.println("Language of document set to " + language);
616 
617  }
618 
/** Sets the language code that setDocLanguage writes into the converted document. */
619  public void setLanguage(String lang) {
620  language = lang;
621  }
622 
/**
 * Delegates to fix(cd), stores the result in the report field and returns it.
 * There is no check-only mode: the conversion is carried out by this call as well.
 */
623  @Override
624  public Report check(CorpusData cd) {
625  //convert the file
626  //save the converted file
627  //TODO
628  //doesn't really make sense to have check only here
629  report = fix(cd);
630  return report;
631  }
632 
/**
 * Replaces the report field with a fresh Report, runs function(cd) and returns its result,
 * which is also stored in the report field.
 */
633  @Override
634  public Report fix(CorpusData cd) {
635  //convert the file
636  //save the converted file
637  //String for filename where it should be written
638  //better be a URL?
 //the fresh report is assigned before the conversion starts
639  report = new Report();
640  report = function(cd);
641  return report;
642  }
643 
/**
 * Adds the class de.uni_hamburg.corpora.BasicTranscriptionData to the IsUsableFor collection
 * and returns that collection. If the class cannot be loaded, the exception is added to the
 * report and the collection is returned as it is.
 * NOTE(review): the class is added on every call; whether this creates duplicates depends on
 * the collection type, which is declared outside this file - confirm.
 */
644  @Override
645  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
646  try {
 //the class is looked up by name at runtime
647  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
648  IsUsableFor.add(cl);
649  } catch (ClassNotFoundException ex) {
650  report.addException(ex, "unknown class not found error");
651  }
652  return IsUsableFor;
653  }
654 
655 
/** Returns a fixed one-sentence description of this converter. */
656  @Override
657  public String getDescription() {
658  String description = "This class takes an exb as input and converts it into ISO standard TEI format. ";
659  return description;
660  }
661 
/**
 * Not implemented for this converter: every call throws an UnsupportedOperationException
 * with the message "Not supported yet.".
 */
662  @Override
663  public Report execute(Corpus c, boolean fix) {
664  throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
665  }
666 
667 }
Collection< Class<?extends CorpusData > > getIsUsableFor()
static Vector TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation, boolean includeFullText)
void addCritical(String description)
Definition: Report.java:104
String readInternalResourceAsString(String path2resource)
Definition: CorpusIO.java:183
static String InputStream2String(InputStream is)
Report convertCD2MORPHEMEHIATISOTEI(CorpusData cd, boolean includeFullText, String XPath2Morphemes)
static Vector TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation)
void addCorrect(String statId, String description)
Definition: Report.java:217
Document SegmentedTranscriptionToTEITranscription(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation, boolean includeFullText, CorpusData cd)
static org.jdom.Document String2JdomDocument(String stringRespresentingDocument)
static String JdomDocument2String(org.jdom.Document jdomDocument)
void addException(Throwable e, String description)
Definition: Report.java:287
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:63