corpus-services  1.0
EXB2HIATISOTEI.java
Go to the documentation of this file.
1 /*
2  * To change this license header, choose License Headers in Project Properties.
3  * To change this template file, choose Tools | Templates
4  * and open the template in the editor.
5  */
6 package de.uni_hamburg.corpora.conversion;
7 
15 import java.io.IOException;
16 import java.util.HashSet;
17 import java.util.Hashtable;
18 import java.util.List;
19 import java.util.Vector;
20 //TODO get rid of emxa imports in the future
21 import org.exmaralda.common.jdomutilities.IOUtilities;
22 import org.exmaralda.partitureditor.fsm.FSMException;
23 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
24 import org.exmaralda.partitureditor.jexmaralda.SegmentedTranscription;
26 import java.io.UnsupportedEncodingException;
27 import java.net.MalformedURLException;
28 import java.net.URISyntaxException;
29 import java.net.URL;
30 import org.exmaralda.partitureditor.jexmaralda.segment.HIATSegmentation;
31 import org.jdom.Attribute;
32 import org.jdom.Document;
33 import org.jdom.Element;
34 import org.jdom.JDOMException;
35 import org.jdom.Namespace;
36 import org.jdom.Text;
37 import org.jdom.transform.XSLTransformException;
38 import org.jdom.xpath.XPath;
39 import org.xml.sax.SAXException;
40 import java.util.*;
42 import javax.xml.parsers.ParserConfigurationException;
43 import javax.xml.transform.TransformerException;
44 import javax.xml.xpath.XPathExpressionException;
45 import org.exmaralda.common.corpusbuild.FileIO;
46 import org.exmaralda.common.corpusbuild.TextFilter;
47 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
48 
57 public class EXB2HIATISOTEI extends Converter implements CorpusFunction {
58 
59  //copied partly from exmaralda\src\org\exmaralda\partitureditor\jexmaralda\convert\TEIConverter.java
// Language code written to the xml:lang attribute of the TEI text element
// (see setDocLanguage); can be replaced through setLanguage.
60  String language = "en";
61 
62  //locations of the used xsls
// Each path is loaded with cio.readInternalResourceAsString in
// SegmentedTranscriptionToTEITranscription.
63  static String TEI_SKELETON_STYLESHEET_ISO = "/xsl/EXMARaLDA2ISOTEI_Skeleton.xsl";
64  static String SC_TO_TEI_U_STYLESHEET_ISO = "/xsl/SegmentChain2ISOTEIUtterance.xsl";
65  static String SORT_AND_CLEAN_STYLESHEET_ISO = "/xsl/ISOTEICleanAndSort.xsl";
// The following three stylesheets are only applied when TOKEN is set.
66  static String TIME2TOKEN_SPAN_REFERENCES = "/xsl/time2tokenSpanReferences.xsl";
67  static String REMOVE_TIME = "/xsl/removeTimepointsWithoutAbsolute.xsl";
68  static String SPANS2_ATTRIBUTES = "/xsl/spans2attributes.xsl";
69 
// Path to an external finite state machine; when not empty it is assigned to
// HIATSegmentation.pathToExternalFSM in cd2SegmentedTranscription. Set via setFSM.
70  static String FSM = "";
71 
// XPath of the node that receives the merged utterances. Note that
// SegmentedTranscriptionToTEITranscription overwrites this static value with "//tei:body".
72  static String BODY_NODE = "//text";
73 
74  //the default tier where the morpheme segmentation is located
// Passed as the third argument of convertEXB2MORPHEMEHIATISOTEI(cd, boolean, String).
75  String XPath2Morphemes = "/basic-transcription/basic-body/tier[@id = \"mb\"]";
76  //Name of deep segmentation
77  String nameOfDeepSegmentation = "SpeakerContribution_Utterance_Word";
// Name of the flat segmentation (the identifier is spelled "Flategmentation" throughout the class).
78  String nameOfFlategmentation = "SpeakerContribution_Event";
79 
80  //transformers for three transformations
// NOTE(review): none of these three fields is referenced in the visible methods,
// which create a local XSLTransformer instead - confirm whether they are still needed.
81  XSLTransformer transformer;
82  XSLTransformer transformer2;
83  XSLTransformer transformer3;
84 
// Used for reading the internal stylesheets, reading EXB files and writing the TEI result.
85  CorpusIO cio = new CorpusIO();
86 
87  //debugging
88  //String intermediate1 = "file:///home/anne/Schreibtisch/TEI/intermediate1.xml";
89  //String intermediate2 = "file:///home/anne/Schreibtisch/TEI/intermediate2.xml";
90  //String intermediate3 = "file:///home/anne/Schreibtisch/TEI/intermediate3.xml";
91  //String intermediate4 = "file:///home/anne/Schreibtisch/TEI/intermediate4.xml";
92  //String intermediate5 = "file:///home/anne/Schreibtisch/TEI/intermediate5.xml";
// INEL and TOKEN are static, so setInel()/setToken() on one instance affect every instance.
// INEL: passes the XSLT parameter mode=inel, keeps the flat segmentation as annotations
//       and generates ids for event and seg elements.
// TOKEN: runs the three extra stylesheets (time2token, removeTime, spans2attributes).
93  static Boolean INEL = false;
94  static Boolean TOKEN = false;
// Set to true by function(Corpus) before the Coma based conversion is started.
95  Boolean COMA = false;
96 
// NOTE(review): not referenced in the visible methods.
97  URL cdURL;
98 
99 
100  /*
101  * this method takes a CorpusData object, converts it into HIAT ISO TEI and saves it
102  * next to the CorpusData object
103  * and gives back a report how it worked
104  */
// Entry point for a single CorpusData object; returns a Report.
// NOTE(review): the statement(s) of the method body (original source line 123) are
// missing from this listing, so which conversion method is called here cannot be
// documented from the visible code - confirm against the real source file.
116  public Report function(CorpusData cd) throws SAXException,
117  FSMException,
118  XSLTransformException,
119  JDOMException,
120  IOException,
121  Exception {
122  //it cannot be a coma file alone
124  }
125 
126  public Report function(Corpus c) throws SAXException,
127  FSMException,
128  XSLTransformException,
129  JDOMException,
130  IOException,
131  Exception {
132  COMA = true;
133  ComaData comad = c.getComaData();
134  return convertCOMA2MORPHEMEHIATISOTEI(comad);
135  }
136 
137  public Report convertCOMA2MORPHEMEHIATISOTEI(CorpusData cd) throws ClassNotFoundException {
138  Report stats = new Report();
139  try {
140  /*
141  Following Code is based on Code from Thomas
142  https://gitlab.rrz.uni-hamburg.de/Bae2551/ids-sample/blob/master/src/java/scripts/ConvertHAMATAC.java
143  */
144  // read COMA doc
145  Namespace teiNamespace = Namespace.getNamespace("tei", "http://www.tei-c.org/ns/1.0");
146  Document comaDoc = FileIO.readDocumentFromLocalFile(cd.getURL().getPath());
147  // select communication elements in COMA xml
148  List<Element> communicationsList = XPath.selectNodes(comaDoc, "//Communication");
149  // iterate through communications
150  for (Element communicationElement : communicationsList) {
151  // select basic transcriptions
152  List<Element> transcriptionsList = XPath.selectNodes(communicationElement, "descendant::Transcription[ends-with(Filename,'.exb')]");
153  // iterate through basic transcriptions
154  for (Element transcriptionElement : transcriptionsList) {
155  String transcriptID = transcriptionElement.getAttributeValue("Id");
156  String nsLink = transcriptionElement.getChildText("NSLink");
157  //choose exb fullPath
158  String fullPath = cd.getParentURL() + "/" + nsLink;
159  URL exburl = new URL(fullPath);
160  //now use the method to get the iso tei version from the exb file
161  CorpusData cdc = cio.readFileURL(exburl);
162  Document stdoc = cd2SegmentedTranscription(cdc);
163  Document finalDoc = SegmentedTranscriptionToTEITranscription(stdoc,
164  nameOfDeepSegmentation,
165  nameOfFlategmentation,
166  false, cd);
167  //now add the coma id information
168  // <idno type="AGD-ID">FOLK_E_00011_SE_01_T_04_DF_01</idno>
169  Element transcriptIdnoElement = new Element("idno", teiNamespace);
170  transcriptIdnoElement.setAttribute("type", "HZSK-ID");
171  transcriptIdnoElement.setText(transcriptID);
172  finalDoc.getRootElement().addContent(0, transcriptIdnoElement);
173 
174  XPath xp1 = XPath.newInstance("//tei:person");
175  xp1.addNamespace(teiNamespace);
176  List<Element> personL = xp1.selectNodes(finalDoc);
177  for (Element personE : personL) {
178  // <person xml:id="SPK0" n="Sh" sex="2">
179  String personSigle = personE.getAttributeValue("n");
180  String xp2 = "//Speaker[Sigle='" + personSigle + "']";
181  Element speakerE = (Element) XPath.selectSingleNode(comaDoc, xp2);
182  String speakerID = speakerE.getAttributeValue("Id");
183  Element speakerIdnoElement = new Element("idno", teiNamespace);
184  speakerIdnoElement.setAttribute("type", "HZSK-ID");
185  speakerIdnoElement.setText(speakerID);
186  personE.addContent(0, speakerIdnoElement);
187 
188  }
189  if (finalDoc != null) {
190  System.out.println("Merged");
191  //so is the language of the doc
192  setDocLanguage(finalDoc, language);
193  //now the completed document is saved
194  //TODO save next to the old cd
195  String filename = cdc.getURL().getFile();
196  URL url = new URL("file://" + filename.substring(0, filename.lastIndexOf(".")) + "_tei.xml");
197  System.out.println(url.toString());
198  cio.write(finalDoc, url);
199  System.out.println("document written.");
200  stats.addCorrect(function, cdc, "ISO TEI conversion of file was successful");
201  } else {
202  stats.addCritical(function, cdc, "ISO TEI conversion of file was not possible because of unknown error");
203  }
204 
205  }
206  }
207 
208  } catch (SAXException ex) {
209  stats.addException(ex, function, cd, "Unknown exception error");
210  } catch (FSMException ex) {
211  stats.addException(ex, function, cd, "Unknown finite state machine error");
212  } catch (MalformedURLException ex) {
213  stats.addException(ex, function, cd, "Unknown file URL reading error");
214  } catch (JDOMException ex) {
215  stats.addException(ex, function, cd, "Unknown file reading error");
216  } catch (IOException ex) {
217  stats.addException(ex, function, cd, "Unknown file reading error");
218  } catch (TransformerException ex) {
219  stats.addException(ex, function, cd, "XSL transformer error");
220  } catch (ParserConfigurationException ex) {
221  stats.addException(ex, function, cd, "Parser error");
222  } catch (XPathExpressionException ex) {
223  stats.addException(ex, function, cd, "XPath error");
224  } catch (URISyntaxException ex) {
225  stats.addException(ex, function, cd, "ComaPath URI error");
226  } catch (JexmaraldaException ex) {
227  stats.addException(ex, function, cd, "Jexmeaalda error");
228  }
229  return stats;
230  }
231 
232  public Report convertEXB2MORPHEMEHIATISOTEI(CorpusData cd) throws SAXException, FSMException, JDOMException, IOException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, XPathExpressionException, URISyntaxException {
233  if (INEL) {
234  TOKEN = true;
235  return convertEXB2MORPHEMEHIATISOTEI(cd, true, XPath2Morphemes);
236  } else {
237  return convertEXB2MORPHEMEHIATISOTEI(cd, false, XPath2Morphemes);
238  }
239  }
240 
241  /*
242  * this method takes a CorpusData object, the info if the fulltext is used, and an individual String where the morpheme segmentation
243  * is located as xpath,
244  * converts it into ISO TEI and saves it TODO where
245  * and gives back a report if it worked
246  */
248  boolean includeFullText, String XPath2Morphemes) throws SAXException, FSMException, JDOMException, IOException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, XPathExpressionException, URISyntaxException {
// NOTE(review): the first line of this signature (original source line 247) is missing
// from this listing. The parameter XPath2Morphemes is not used anywhere in the visible body.
249  Report stats = new Report();
// segment the EXB with the HIAT segmentation
250  Document stdoc = cd2SegmentedTranscription(cd);
251  //TODO paramter in the future for deep & flat segmentation name
252  //MAGIC - now the real work happens
253  Document teiDoc = SegmentedTranscriptionToTEITranscription(stdoc,
254  nameOfDeepSegmentation,
255  nameOfFlategmentation,
256  includeFullText, cd);
// a null result means that one of the transformation steps did not deliver a document
257  if (teiDoc != null) {
258  System.out.println("Merged");
259  //so is the language of the doc
260  setDocLanguage(teiDoc, language);
261  //now the completed document is saved
262  //TODO save next to the old cd
// target: path of the input file with its extension replaced by "_tei.xml"
263  String filename = cd.getURL().getFile();
264  URL url = new URL("file://" + filename.substring(0, filename.lastIndexOf(".")) + "_tei.xml");
265  System.out.println(url.toString());
266  cio.write(teiDoc, url);
267  System.out.println("document written.");
268  stats.addCorrect(function, cd, "ISO TEI conversion of file was successful");
269  } else {
270  stats.addCritical(function, cd, "ISO TEI conversion of file was not possible because of unknown error");
271  }
272 
273  return stats;
274  }
275 
// Normalizes the basic transcription behind cd, applies the HIAT segmentation to it
// and returns the resulting segmented transcription as a JDOM document.
276  public Document cd2SegmentedTranscription(CorpusData cd) throws SAXException, FSMException {
277  //we create a BasicTranscription form the CorpusData
// NOTE(review): the line declaring btd (original source line 278) is missing from this
// listing; presumably cd is cast to a BasicTranscriptionData there - confirm.
279  BasicTranscription bt = btd.getEXMARaLDAbt();
280  //normalize the exb (!)
281  bt.normalize();
282  System.out.println((cd.getURL()).getFile());
283  System.out.println("started writing document...");
284  //HIAT Segmentation
285  HIATSegmentation segmentation = new HIATSegmentation();
286  /*
287  //reading the internal FSM and writing it to TEMP folder because Exmaralda Segmentation only takes an external path
288  InputStream is = getClass().getResourceAsStream(FSM);
289  String fsmstring = TypeConverter.InputStream2String(is);
290  URL url = Paths.get(System.getProperty("java.io.tmpdir") + "/" + "fsmstring.xml").toUri().toURL();
291  cio.write(fsmstring, url);
292  segmentation = new HIATSegmentation(url.getFile());
293  */
294  //default HIAT segmentation
// an external FSM is only used when one was configured via setFSM
295  if (!FSM.equals("")) {
296  segmentation.pathToExternalFSM = FSM;
297  }
298  //create a segmented exs
299  SegmentedTranscription st = segmentation.BasicToSegmented(bt);
300  System.out.println("Segmented transcription created");
301  //Document from segmented transcription string
302  Document stdoc = TypeConverter.String2JdomDocument(st.toXML());
303  return stdoc;
304  }
305 
306  public Document SegmentedTranscriptionToTEITranscription(Document segmentedTranscription,
307  String nameOfDeepSegmentation,
308  String nameOfFlatSegmentation,
309  boolean includeFullText, CorpusData cd) throws JDOMException, IOException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, SAXException, XPathExpressionException, URISyntaxException {
310 
311  Document finalDocument = null;
312  String skeleton_stylesheet = cio.readInternalResourceAsString(TEI_SKELETON_STYLESHEET_ISO);
313 
314  String transform_stylesheet = cio.readInternalResourceAsString(SC_TO_TEI_U_STYLESHEET_ISO);
315 
316  String sort_and_clean_stylesheet = cio.readInternalResourceAsString(SORT_AND_CLEAN_STYLESHEET_ISO);
317 
318  String time_2_token_stylesheet = cio.readInternalResourceAsString(TIME2TOKEN_SPAN_REFERENCES);
319  String remove_time_stylesheet = cio.readInternalResourceAsString(REMOVE_TIME);
320  String spans_2_attributes_stylesheet = cio.readInternalResourceAsString(SPANS2_ATTRIBUTES);
321 
322  Document teiDocument = null;
323 
324  XSLTransformer xslt = new XSLTransformer();
325  //transform wants an xml as string object and xsl as String Object
326  //System.out.println(skeleton_stylesheet);
327  String result
328  = xslt.transform(TypeConverter.JdomDocument2String(segmentedTranscription), skeleton_stylesheet);
329  if (result != null) {
330  //now we get a document of the first transformation, the iso tei skeleton
331  teiDocument = TypeConverter.String2JdomDocument(result);
332  System.out.println("STEP 1 completed.");
333  //cio.write(teiDocument, new URL(intermediate1));
334 
335  /*
336  * this method will take the segmented transcription and, for each speaker
337  * contribution in the segmentation with the name 'nameOfDeepSegmentation'
338  * will add anchors from the segmentation with the name
339  * 'nameOfFlatSegmentation' such that the temporal information provided in
340  * the flat segmentation is completely represented as anchors within the
341  * deep segmentation. The typical application scenario is to give this
342  * method a segmented HIAT transcription with nameOfDeepSegmentation =
343  * 'SpeakerContribution_Utterance_Word' nameOfFlatSegmentation =
344  * 'SpeakerContribution_Event'
345  */
346  Vector uElements = TEIMerge(segmentedTranscription, nameOfDeepSegmentation, nameOfFlatSegmentation, includeFullText);
347 
348  XPath xp = XPath.newInstance(BODY_NODE);
349  BODY_NODE = "//tei:body";
350  xp = XPath.newInstance(BODY_NODE);
351  xp.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
352 
353  Element textNode = (Element) (xp.selectSingleNode(teiDocument));
354  textNode.addContent(uElements);
355  if (teiDocument != null) {
356  System.out.println("STEP 2 completed.");
357  //cio.write(teiDocument, new URL(intermediate2));
358  Document transformedDocument = null;
359  if (INEL) {
360  xslt.setParameter("mode", "inel");
361  }
362  String result2
363  = xslt.transform(TypeConverter.JdomDocument2String(teiDocument), transform_stylesheet);
364  transformedDocument = IOUtilities.readDocumentFromString(result2);
365  if (transformedDocument != null) {
366  //fix for issue #89
367  textNode = (Element) (xp.selectSingleNode(transformedDocument));
368  System.out.println("STEP 3 completed.");
369  //cio.write(transformedDocument, new URL(intermediate3));
370  // now take care of the events from tiers of type 'd'
371  XPath xp2 = XPath.newInstance("//segmentation[@name='Event']/ats");
372  List events = xp2.selectNodes(segmentedTranscription);
373  for (int pos = 0; pos < events.size(); pos++) {
374  Element exmaraldaEvent = (Element) (events.get(pos));
375  String category = exmaraldaEvent.getParentElement().getParentElement().getAttributeValue("category");
376 
377  String elementName = "event";
378  if (category.equals("pause")) {
379  elementName = "pause";
380  }
381 
382  Element teiEvent = new Element(elementName);
383 
384  String speakerID = exmaraldaEvent.getParentElement().getParentElement().getAttributeValue("speaker");
385  if (speakerID != null) {
386  teiEvent.setAttribute("who", speakerID);
387  }
388  teiEvent.setAttribute("start", exmaraldaEvent.getAttributeValue("s"));
389  teiEvent.setAttribute("end", exmaraldaEvent.getAttributeValue("e"));
390  if (!category.equals("pause")) {
391  teiEvent.setAttribute("desc", exmaraldaEvent.getText());
392  teiEvent.setAttribute("type", category);
393  } else {
394  String duration = exmaraldaEvent.getText().replaceAll("\\(", "").replaceAll("\\)", "");
395  teiEvent.setAttribute("dur", duration);
396  }
397  textNode.addContent(teiEvent);
398  }
399  if (TOKEN) {
400  /*
401  HAMATAC ISO TEI VERSION from Thomas:
402  (2) Ein Mapping von zeitbasierten <span>s auf tokenbasierte <span>s,
403  d.h. @to and @from zeigen danach auf Token-IDs statt auf Timeline-IDs.
404  Das macht ein Stylesheet:
405  https://github.com/EXMARaLDA/exmaralda/blob/master/src/org/exmaralda/tei/xml/time2tokenSpanReferences.xsl
406  */
407  //System.out.println("Document is: " + TypeConverter.JdomDocument2String(transformedDocument));
408  String result4
409  = xslt.transform(TypeConverter.JdomDocument2String(transformedDocument), time_2_token_stylesheet);
410  /*
411  (3) Das Löschen von "überflüssigen" <when> und <anchor>-Elementen,
412  also solchen, die im PE gebraucht wurden, um Annotationen zu
413  spezifizieren, die aber sonst keine Information (absolute Zeitwerte)
414  tragen. Wenn <span>s nach Schritt (2) nicht mehr auf Timeline-IDs
415  zeigen, braucht man diese Elemente nicht mehr wirklich (schaden tun
416  sie aber eigentlich auch nicht)
417  macht auch ein Stylesheet:
418  https://github.com/EXMARaLDA/exmaralda/blob/master/src/org/exmaralda/tei/xml/removeTimepointsWithoutAbsolute.xsl
419  */
420  String result5
421  = xslt.transform(result4, remove_time_stylesheet);
422  String result6
423  = xslt.transform(result5, spans_2_attributes_stylesheet);
424  transformedDocument = IOUtilities.readDocumentFromString(result6);
425 
426  }
427  //generate element ids
428  generateWordIDs(transformedDocument);
429  //cio.write(transformedDocument, new URL(intermediate4));
430  if (transformedDocument != null) {
431  //Here the annotations are taken care of
432  //this is important for the INEL morpheme segmentations
433  //for the INEL transformation, the word IDs are generated earlier
434  String result3
435  = xslt.transform(TypeConverter.JdomDocument2String(transformedDocument), sort_and_clean_stylesheet);
436  if (result3 != null) {
437  finalDocument = IOUtilities.readDocumentFromString(result3);
438  if (finalDocument != null) {
439  //cio.write(finalDocument, new URL(intermediate5));
440  }
441  }
442  }
443  }
444  }
445  }
446  return finalDocument;
447  }
448 
449  public static Vector TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation) {
450  return TEIMerge(segmentedTranscription, nameOfDeepSegmentation, nameOfFlatSegmentation, false);
451  }
452 
471  public static Vector TEIMerge(Document segmentedTranscription,
472  String nameOfDeepSegmentation,
473  String nameOfFlatSegmentation,
474  boolean includeFullText) {
475  try {
476 
477  // Make a map of the timeline
478  Hashtable timelineItems = new Hashtable();
479  String xpath = "//tli";
480  XPath xpx = XPath.newInstance(xpath);
481  List tlis = xpx.selectNodes(segmentedTranscription);
482  for (int pos = 0; pos < tlis.size(); pos++) {
483 
484  timelineItems.put(((Element) (tlis.get(pos))).getAttributeValue("id"), pos);
485  }
486 
487  Vector returnValue = new Vector();
488  XPath xp1 = XPath.newInstance("//segmentation[@name='" + nameOfDeepSegmentation + "']/ts");
489  List segmentChains = xp1.selectNodes(segmentedTranscription);
490  // go through all top level segment chains
491  for (Object segmentChain : segmentChains) {
492  Element sc = (Element) (segmentChain);
493  sc.setAttribute("speaker", sc.getParentElement().getParentElement().getAttributeValue("speaker"));
494  String tierref = sc.getParentElement().getAttributeValue("tierref");
495  String start = sc.getAttributeValue("s");
496  String end = sc.getAttributeValue("e");
497  String xpath2 = "//segmentation[@name='" + nameOfFlatSegmentation + "' and @tierref='" + tierref + "']"
498  + "/ts[@s='" + start + "' and @e='" + end + "']";
499  XPath xp2 = XPath.newInstance(xpath2);
500  Element sc2 = (Element) (xp2.selectSingleNode(segmentedTranscription));
501  if (sc2 == null) {
502  //this means that no corresponding top level
503  //element was found in the second segmentation
504  //which should not happen
505  throw new Exception(tierref + " " + start + " " + end);
506  }
507  // this is where the magic happens
508  Element mergedElement = merge(sc, sc2);
509 
510  // now take care of the corresponding annotations
511  int s = ((Integer) (timelineItems.get(start)));
512  int e = ((Integer) (timelineItems.get(end)));
513  //We would also like to keep the FlatSegmentation as an annotation to display it correctly
514  if (INEL) {
515  String xpath3 = "//segmentation[@name='" + nameOfFlatSegmentation + "' and @tierref='" + tierref + "']"
516  + "/ts[@s='" + start + "' and @e='" + end + "']/ts";
517  XPath xp3 = XPath.newInstance(xpath3);
518  List transannos = xp3.selectNodes(segmentedTranscription);
519  for (Object transanno1 : transannos) {
520  Element transanno = (Element) transanno1;
521  String transaStart = transanno.getAttributeValue("s");
522  String transaEnd = transanno.getAttributeValue("e");
523  int transas = ((Integer) (timelineItems.get(transaStart)));
524  int transae = ((Integer) (timelineItems.get(transaEnd)));
525  boolean transannotationBelongsToThisElement = (transas >= s && transas <= e) || (transae >= s && transae <= e);
526  if (transannotationBelongsToThisElement) {
527  Element annotationsElement = mergedElement.getChild("annotations");
528  if (annotationsElement == null) {
529  annotationsElement = new Element("annotations");
530  mergedElement.addContent(annotationsElement);
531  }
532  Element annotation = new Element("annotation");
533  annotation.setAttribute("start", transaStart);
534  annotation.setAttribute("end", transaEnd);
535  annotation.setAttribute("level", transanno.getParentElement().getParentElement().getAttributeValue("name"));
536  annotation.setAttribute("value", transanno.getText());
537  annotationsElement.addContent(annotation);
538  }
539  }
540  }
541  // now take care of the corresponding annotations
542  String xpath5 = "//segmented-tier[@id='" + tierref + "']/annotation/ta";
543  XPath xp5 = XPath.newInstance(xpath5);
544  List annotations = xp5.selectNodes(segmentedTranscription);
545  for (Object annotation1 : annotations) {
546  Element anno = (Element) (annotation1);
547  String aStart = anno.getAttributeValue("s");
548  String aEnd = anno.getAttributeValue("e");
549  int as = ((Integer) (timelineItems.get(aStart)));
550  int ae = ((Integer) (timelineItems.get(aEnd)));
551  boolean annotationBelongsToThisElement = (as >= s && as <= e) || (ae >= s && ae <= e);
552  if (annotationBelongsToThisElement) {
553  Element annotationsElement = mergedElement.getChild("annotations");
554  if (annotationsElement == null) {
555  annotationsElement = new Element("annotations");
556  mergedElement.addContent(annotationsElement);
557  }
558  Element annotation = new Element("annotation");
559  annotation.setAttribute("start", aStart);
560  annotation.setAttribute("end", aEnd);
561  annotation.setAttribute("level", anno.getParentElement().getAttributeValue("name"));
562  annotation.setAttribute("value", anno.getText());
563  annotationsElement.addContent(annotation);
564  }
565 
566  //System.out.println(s + "/" + e + " **** " + as + "/" + ae);
567  }
568 
569  //*****************************************
570  // NEW 25-04-2016
571  // include full text if Daniel J. wisheth thus
572  if (includeFullText) {
573  Element annotation = new Element("annotation");
574  annotation.setAttribute("start", start);
575  annotation.setAttribute("end", end);
576  annotation.setAttribute("level", "full-text");
577 
578  String fullText = "";
579  List l = XPath.selectNodes(sc2, "descendant::text()");
580  for (Object o : l) {
581  Text text = (Text) o;
582  fullText += text.getText();
583  }
584  annotation.setAttribute("value", fullText);
585 
586  Element annotationsElement = mergedElement.getChild("annotations");
587  if (annotationsElement == null) {
588  annotationsElement = new Element("annotations");
589  mergedElement.addContent(annotationsElement);
590  }
591  annotationsElement.addContent(annotation);
592  }
593  //*****************************************
594 
595  returnValue.addElement(mergedElement.detach());
596  }
597 
598  // issue #89 - Now the vector contains elements only from the
599  // segmentations passed as parameters
600  // in particular, it seems that tiers of type 'd' (which end up as
601  // segmentation @name='Event' are lost
602  return returnValue;
603  } catch (JDOMException ex) {
604  ex.printStackTrace();
605  } catch (Exception ex) {
606  ex.printStackTrace();
607  }
608  return null;
609  }
610 
611  static Element merge(Element e1, Element e2) {
612 
613  Iterator i1 = e1.getDescendants();
614  Vector pcData1 = new Vector();
615  while (i1.hasNext()) {
616  pcData1.addElement(i1.next());
617  }
618 
619  Iterator i2 = e2.getDescendants(new TextFilter());
620  Vector pcData2 = new Vector();
621  while (i2.hasNext()) {
622  pcData2.addElement(i2.next());
623  }
624 
625  int charBoundary = 0;
626  for (int pos = 0; pos < pcData2.size() - 1; pos++) {
627  Text eventText = (Text) (pcData2.elementAt(pos));
628  Element anchor = new Element("anchor");
629  Element event = eventText.getParentElement();
630  String start = event.getAttributeValue("e");
631  anchor.setAttribute("synch", start);
632 
633  charBoundary += eventText.getText().length();
634  // jetzt durch den anderen Baum laufen und den zugehoerigen Anker
635  // an der richtigen Stelle einfuegen
636  int charCount = 0;
637  for (int pos2 = 0; pos2 < pcData1.size(); pos2++) {
638  Object o = pcData1.elementAt(pos2);
639  if (!(o instanceof Text)) {
640  continue;
641  }
642  Text segmentText = (Text) o;
643  int textLength = segmentText.getText().length();
644  if (charCount + textLength < charBoundary) {
645  charCount += textLength;
646  continue;
647  } else if (charCount + textLength == charBoundary) {
648  Element parent = segmentText.getParentElement();
649  int index = parent.indexOf(segmentText);
650  Element parentOfParent = parent.getParentElement();
651  int index2 = parentOfParent.indexOf(parent);
652  parentOfParent.addContent(index2 + 1, anchor);
653  break;
654  }
655  // charCount+textLength>charBoundary
656  String leftPart = segmentText.getText().substring(0, charBoundary - charCount);
657  String rightPart = segmentText.getText().substring(charBoundary - charCount);
658  Text leftText = new Text(leftPart);
659  Text rightText = new Text(rightPart);
660 
661  // neue Sachen muessen zweimal eingefuegt werden - einmal
662  // in den Vector, einmal in den Parent
663  // Sachen im Vector muessen den richtigen Parent bekommen
664  Element parent = segmentText.getParentElement();
665  parent.removeContent(segmentText);
666  parent.addContent(leftText);
667  parent.addContent(anchor);
668  parent.addContent(rightText);
669 
670  pcData1.remove(segmentText);
671  pcData1.add(pos2, rightText);
672  pcData1.add(pos2, anchor);
673  pcData1.add(pos2, leftText);
674  break;
675  }
676  }
677 
678  return e1;
679  }
680 
681  // new 30-03-2016
682  //this needed to be adapted to morpheme ids - and changed for the word IDs too
683  //and we need to generate the spans for the morphemes somewhere too
684  private void generateWordIDs(Document document) throws JDOMException {
685  // added 30-03-2016
686  HashSet<String> allExistingIDs = new HashSet<String>();
687  XPath idXPath = XPath.newInstance("//tei:*[@xml:id]");
688  idXPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
689  idXPath.addNamespace(Namespace.XML_NAMESPACE);
690  List idElements = idXPath.selectNodes(document);
691  for (Object o : idElements) {
692  Element e = (Element) o;
693  allExistingIDs.add(e.getAttributeValue("id", Namespace.XML_NAMESPACE));
694  }
695 
696  // changed 30-03-2016
697  XPath wordXPath = XPath.newInstance("//tei:w[not(@xml:id)]");
698  wordXPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
699  wordXPath.addNamespace(Namespace.XML_NAMESPACE);
700 
701  List words = wordXPath.selectNodes(document);
702  int count = 1;
703  for (Object o : words) {
704  Element word = (Element) o;
705  while (allExistingIDs.contains("w" + Integer.toString(count))) {
706  count++;
707  }
708 
709  String wordID = "w" + Integer.toString(count);
710  allExistingIDs.add(wordID);
711  //System.out.println("*** " + wordID);
712  word.setAttribute("id", wordID, Namespace.XML_NAMESPACE);
713  }
714 
715  // new 02-12-2014
716  XPath pcXPath = XPath.newInstance("//tei:pc[not(@xml:id)]");
717  pcXPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
718  pcXPath.addNamespace(Namespace.XML_NAMESPACE);
719 
720  List pcs = pcXPath.selectNodes(document);
721  count = 1;
722  for (Object o : pcs) {
723  Element pc = (Element) o;
724  while (allExistingIDs.contains("pc" + Integer.toString(count))) {
725  count++;
726  }
727 
728  String pcID = "pc" + Integer.toString(count);
729  allExistingIDs.add(pcID);
730  //System.out.println("*** " + wordID);
731  pc.setAttribute("id", pcID, Namespace.XML_NAMESPACE);
732  }
733  if (INEL) {
734  // we also need this for events/incidents
735  XPath incXPath = XPath.newInstance("//tei:event[not(@xml:id)]");
736  pcXPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
737  pcXPath.addNamespace(Namespace.XML_NAMESPACE);
738 
739  List incs = incXPath.selectNodes(document);
740  count = 1;
741  for (Object o : incs) {
742  Element pc = (Element) o;
743  while (allExistingIDs.contains("inc" + Integer.toString(count))) {
744  count++;
745  }
746 
747  String incID = "inc" + Integer.toString(count);
748  allExistingIDs.add(incID);
749  //System.out.println("*** " + wordID);
750  pc.setAttribute("id", incID, Namespace.XML_NAMESPACE);
751  }
752 
753  // we also need this for seg elements
754  XPath segXPath = XPath.newInstance("//tei:seg[not(@xml:id)]");
755  pcXPath.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
756  pcXPath.addNamespace(Namespace.XML_NAMESPACE);
757 
758  List segs = segXPath.selectNodes(document);
759  count = 1;
760  for (Object o : segs) {
761  Element seg = (Element) o;
762  while (allExistingIDs.contains("seg" + Integer.toString(count))) {
763  count++;
764  }
765 
766  String segID = "seg" + Integer.toString(count);
767  allExistingIDs.add(segID);
768  //System.out.println("*** " + wordID);
769  seg.setAttribute("id", segID, Namespace.XML_NAMESPACE);
770  }
771  }
772  }
773 
774  private void setDocLanguage(Document teiDoc, String language) throws JDOMException {
775  // /TEI/text[1]/@*[namespace-uri()='http://www.w3.org/XML/1998/namespace' and local-name()='lang']
776  XPath xpathToLangAttribute = XPath.newInstance("//tei:text/@xml:lang");
777  xpathToLangAttribute.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
778  xpathToLangAttribute.addNamespace(Namespace.XML_NAMESPACE);
779  Attribute langAtt = (Attribute) xpathToLangAttribute.selectSingleNode(teiDoc);
780  if (langAtt != null) {
781  langAtt.setValue(language);
782  } else {
783  XPath xpathToTextElement = XPath.newInstance("//tei:text");
784  xpathToTextElement.addNamespace("tei", "http://www.tei-c.org/ns/1.0");
785  xpathToTextElement.addNamespace(Namespace.XML_NAMESPACE);
786  Element textEl = (Element) xpathToTextElement.selectSingleNode(teiDoc);
787  textEl.setAttribute("lang", language, Namespace.XML_NAMESPACE);
788  }
789  System.out.println("Language of document set to " + language);
790 
791  }
792 
793  public void setLanguage(String lang) {
794  language = lang;
795  }
796 
797  public void setInel() {
798  INEL = true;
799  }
800 
801  public void setToken() {
802  TOKEN = true;
803  }
804 
805  public void setFSM(String newfsm) {
806  FSM = newfsm;
807  }
808 
809  @Override
810  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
811  try {
812  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
813  IsUsableFor.add(cl);
814  //Coma will only be used if a corpus is supplied
815  //Class cl3 = Class.forName("de.uni_hamburg.corpora.ComaData");
816  //IsUsableFor.add(cl3);
817  } catch (ClassNotFoundException ex) {
818  report.addException(ex, "unknown class not found error");
819  }
820  return IsUsableFor;
821  }
822 
823  @Override
824  public String getDescription() {
825  String description = "This class takes an exb as input and converts it into ISO standard TEI format. ";
826  return description;
827  }
828 
829 }
void setParameter(String parameterName, Object parameterValue)
CorpusData readFileURL(URL url, Collection< Class<?extends CorpusData >> clcds)
Definition: CorpusIO.java:125
Collection< Class<?extends CorpusData > > getIsUsableFor()
static Vector TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation, boolean includeFullText)
void addCritical(String description)
Definition: Report.java:104
Report convertEXB2MORPHEMEHIATISOTEI(CorpusData cd, boolean includeFullText, String XPath2Morphemes)
String readInternalResourceAsString(String path2resource)
Definition: CorpusIO.java:192
void addCorrect(String statId, String description)
Definition: Report.java:217
static org.jdom.Document String2JdomDocument(String stringRespresentingDocument)
Document SegmentedTranscriptionToTEITranscription(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation, boolean includeFullText, CorpusData cd)
static String JdomDocument2String(org.jdom.Document jdomDocument)
void addException(Throwable e, String description)
Definition: Report.java:287
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:66
static Vector TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation)