6 package de.uni_hamburg.corpora.conversion;
15 import java.io.IOException;
16 import java.util.HashSet;
17 import java.util.Hashtable;
18 import java.util.List;
19 import java.util.Vector;
21 import org.exmaralda.common.jdomutilities.IOUtilities;
22 import org.exmaralda.partitureditor.fsm.FSMException;
23 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
24 import org.exmaralda.partitureditor.jexmaralda.SegmentedTranscription;
26 import java.io.InputStream;
27 import java.net.MalformedURLException;
29 import java.nio.file.Paths;
30 import org.exmaralda.partitureditor.jexmaralda.segment.HIATSegmentation;
31 import org.jdom.Attribute;
32 import org.jdom.Document;
33 import org.jdom.Element;
34 import org.jdom.JDOMException;
35 import org.jdom.Namespace;
37 import org.jdom.xpath.XPath;
38 import org.xml.sax.SAXException;
40 import javax.xml.parsers.ParserConfigurationException;
41 import javax.xml.
transform.TransformerException;
42 import javax.xml.xpath.XPathExpressionException;
43 import org.exmaralda.common.corpusbuild.TextFilter;
53 String language =
"en";
55 final String
function =
"inel iso tei";
58 static String TEI_SKELETON_STYLESHEET_ISO =
"/xsl/EXMARaLDA2ISOTEI_Skeleton.xsl";
59 static String SC_TO_TEI_U_STYLESHEET_ISO =
"/xsl/SegmentChain2ISOTEIUtteranceINEL.xsl";
60 static String SORT_AND_CLEAN_STYLESHEET_ISO =
"/xsl/ISOTEICleanAndSortINEL.xsl";
61 static String FSM =
"/xsl/INEL_Segmentation_FSM.xml";
63 static String BODY_NODE =
"//text";
66 String XPath2Morphemes =
"/basic-transcription/basic-body/tier[@id = \"mb\"]";
68 String nameOfDeepSegmentation =
"SpeakerContribution_Utterance_Word";
69 String nameOfFlategmentation =
"SpeakerContribution_Event";
79 super(
"EXB2INELISOTEI");
98 boolean includeFullText, String XPath2Morphemes) {
105 System.out.println((cd.
getURL()).getFile());
106 System.out.println(
"started writing document...");
110 HIATSegmentation segmentation;
113 InputStream is = getClass().getResourceAsStream(FSM);
115 URL url = Paths.get(System.getProperty(
"java.io.tmpdir")+
"/" +
"fsmstring.xml").toUri().toURL();
116 cio.
write(fsmstring, url);
117 segmentation =
new HIATSegmentation(url.getFile());
121 segmentation =
new HIATSegmentation();
125 SegmentedTranscription st = segmentation.BasicToSegmented(bt);
126 System.out.println(
"Segmented transcription created");
132 nameOfDeepSegmentation,
133 nameOfFlategmentation,
134 includeFullText, cd);
135 if (teiDoc != null) {
136 System.out.println(
"Merged");
138 setDocLanguage(teiDoc, language);
140 String filename = cd.
getURL().getFile();
141 URL url =
new URL(
"file://" + filename.substring(0, filename.lastIndexOf(
".")) +
"_tei.xml");
142 cio.
write(teiDoc, url);
144 System.out.println(
"document written.");
145 report.
addCorrect(
function, cd,
"ISO TEI conversion of file was successful");
147 report.
addCritical(
function, cd,
"ISO TEI conversion of file was not possible because of unknown error");
150 }
catch (SAXException ex) {
151 report.
addException(ex,
function, cd,
"Unknown exception error");
152 }
catch (FSMException ex) {
153 report.
addException(ex,
function, cd,
"Unknown finite state machine error");
154 }
catch (MalformedURLException ex) {
155 report.
addException(ex,
function, cd,
"Unknown file URL reading error");
156 }
catch (JDOMException ex) {
157 report.
addException(ex,
function, cd,
"Unknown file reading error");
158 }
catch (IOException ex) {
159 report.
addException(ex,
function, cd,
"Unknown file reading error");
160 }
catch (TransformerException ex) {
161 report.
addException(ex,
function, cd,
"XSL transformer error");
162 }
catch (ParserConfigurationException ex) {
164 }
catch (XPathExpressionException ex) {
171 String nameOfDeepSegmentation,
172 String nameOfFlatSegmentation,
173 boolean includeFullText,
CorpusData cd)
throws JDOMException, IOException, TransformerException {
175 Document finalDocument = null;
182 Document teiDocument = null;
189 if (result != null) {
192 System.out.println(
"STEP 1 completed.");
206 Vector uElements =
TEIMerge(segmentedTranscription, nameOfDeepSegmentation, nameOfFlatSegmentation, includeFullText);
208 XPath xp = XPath.newInstance(BODY_NODE);
209 BODY_NODE =
"//tei:body";
210 xp = XPath.newInstance(BODY_NODE);
211 xp.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
213 Element textNode = (Element) (xp.selectSingleNode(teiDocument));
214 textNode.addContent(uElements);
215 if (teiDocument != null) {
216 System.out.println(
"STEP 2 completed.");
218 Document transformedDocument = null;
221 transformedDocument = IOUtilities.readDocumentFromString(result2);
222 if (transformedDocument != null) {
224 textNode = (Element) (xp.selectSingleNode(transformedDocument));
225 System.out.println(
"STEP 3 completed.");
227 XPath xp2 = XPath.newInstance(
"//segmentation[@name='Event']/ats");
228 List events = xp2.selectNodes(segmentedTranscription);
229 for (
int pos = 0; pos < events.size(); pos++) {
230 Element exmaraldaEvent = (Element) (events.get(pos));
231 String category = exmaraldaEvent.getParentElement().getParentElement().getAttributeValue(
"category");
233 String elementName =
"event";
234 if (category.equals(
"pause")) {
235 elementName =
"pause";
238 Element teiEvent =
new Element(elementName);
240 String speakerID = exmaraldaEvent.getParentElement().getParentElement().getAttributeValue(
"speaker");
241 if (speakerID != null) {
242 teiEvent.setAttribute(
"who", speakerID);
244 teiEvent.setAttribute(
"start", exmaraldaEvent.getAttributeValue(
"s"));
245 teiEvent.setAttribute(
"end", exmaraldaEvent.getAttributeValue(
"e"));
246 if (!category.equals(
"pause")) {
247 teiEvent.setAttribute(
"desc", exmaraldaEvent.getText());
248 teiEvent.setAttribute(
"type", category);
250 String duration = exmaraldaEvent.getText().replaceAll(
"\\(",
"").replaceAll(
"\\)",
"");
251 teiEvent.setAttribute(
"dur", duration);
253 textNode.addContent(teiEvent);
258 generateWordIDs(transformedDocument);
259 if (transformedDocument != null) {
265 if (result3 != null) {
266 finalDocument = IOUtilities.readDocumentFromString(result3);
267 if (finalDocument != null) {
275 return finalDocument;
278 public static Vector
TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation) {
279 return TEIMerge(segmentedTranscription, nameOfDeepSegmentation, nameOfFlatSegmentation,
false);
300 public static Vector
TEIMerge(Document segmentedTranscription,
301 String nameOfDeepSegmentation,
302 String nameOfFlatSegmentation,
303 boolean includeFullText) {
307 Hashtable timelineItems =
new Hashtable();
308 String xpath =
"//tli";
309 XPath xpx = XPath.newInstance(xpath);
310 List tlis = xpx.selectNodes(segmentedTranscription);
311 for (
int pos = 0; pos < tlis.size(); pos++) {
313 timelineItems.put(((Element) (tlis.get(pos))).getAttributeValue(
"id"), pos);
316 Vector returnValue =
new Vector();
317 XPath xp1 = XPath.newInstance(
"//segmentation[@name='" + nameOfDeepSegmentation +
"']/ts");
318 List segmentChains = xp1.selectNodes(segmentedTranscription);
320 for (Object segmentChain : segmentChains) {
321 Element sc = (Element) (segmentChain);
322 sc.setAttribute(
"speaker", sc.getParentElement().getParentElement().getAttributeValue(
"speaker"));
323 String tierref = sc.getParentElement().getAttributeValue(
"tierref");
324 String start = sc.getAttributeValue(
"s");
325 String end = sc.getAttributeValue(
"e");
326 String xpath2 =
"//segmentation[@name='" + nameOfFlatSegmentation +
"' and @tierref='" + tierref +
"']" 327 +
"/ts[@s='" + start +
"' and @e='" + end +
"']";
328 XPath xp2 = XPath.newInstance(xpath2);
329 Element sc2 = (Element) (xp2.selectSingleNode(segmentedTranscription));
334 throw new Exception(tierref +
" " + start +
" " + end);
337 Element mergedElement = merge(sc, sc2);
338 int s = ((Integer) (timelineItems.get(start)));
339 int e = ((Integer) (timelineItems.get(end)));
342 String xpath3 =
"//segmentation[@name='" + nameOfFlatSegmentation +
"' and @tierref='" + tierref +
"']" 343 +
"/ts[@s='" + start +
"' and @e='" + end +
"']/ts";
344 XPath xp3 = XPath.newInstance(xpath3);
345 List transannos = xp3.selectNodes(segmentedTranscription);
346 for (Object transanno1 : transannos) {
347 Element transanno = (Element) transanno1;
348 String transaStart = transanno.getAttributeValue(
"s");
349 String transaEnd = transanno.getAttributeValue(
"e");
350 int transas = ((Integer) (timelineItems.get(transaStart)));
351 int transae = ((Integer) (timelineItems.get(transaEnd)));
352 boolean transannotationBelongsToThisElement = (transas >= s && transas <= e) || (transae >= s && transae <= e);
353 if (transannotationBelongsToThisElement) {
354 Element annotationsElement = mergedElement.getChild(
"annotations");
355 if (annotationsElement == null) {
356 annotationsElement =
new Element(
"annotations");
357 mergedElement.addContent(annotationsElement);
359 Element annotation =
new Element(
"annotation");
360 annotation.setAttribute(
"start", transaStart);
361 annotation.setAttribute(
"end", transaEnd);
362 annotation.setAttribute(
"level", transanno.getParentElement().getParentElement().getAttributeValue(
"name"));
363 annotation.setAttribute(
"value", transanno.getText());
364 annotationsElement.addContent(annotation);
368 String xpath5 =
"//segmented-tier[@id='" + tierref +
"']/annotation/ta";
369 XPath xp5 = XPath.newInstance(xpath5);
370 List annotations = xp5.selectNodes(segmentedTranscription);
371 for (Object annotation1 : annotations) {
372 Element anno = (Element) (annotation1);
373 String aStart = anno.getAttributeValue(
"s");
374 String aEnd = anno.getAttributeValue(
"e");
375 int as = ((Integer) (timelineItems.get(aStart)));
376 int ae = ((Integer) (timelineItems.get(aEnd)));
377 boolean annotationBelongsToThisElement = (as >= s && as <= e) || (ae >= s && ae <= e);
378 if (annotationBelongsToThisElement) {
379 Element annotationsElement = mergedElement.getChild(
"annotations");
380 if (annotationsElement == null) {
381 annotationsElement =
new Element(
"annotations");
382 mergedElement.addContent(annotationsElement);
384 Element annotation =
new Element(
"annotation");
385 annotation.setAttribute(
"start", aStart);
386 annotation.setAttribute(
"end", aEnd);
387 annotation.setAttribute(
"level", anno.getParentElement().getAttributeValue(
"name"));
388 annotation.setAttribute(
"value", anno.getText());
389 annotationsElement.addContent(annotation);
398 if (includeFullText) {
399 Element annotation =
new Element(
"annotation");
400 annotation.setAttribute(
"start", start);
401 annotation.setAttribute(
"end", end);
402 annotation.setAttribute(
"level",
"full-text");
404 String fullText =
"";
405 List l = XPath.selectNodes(sc2,
"descendant::text()");
407 Text text = (Text) o;
408 fullText += text.getText();
410 annotation.setAttribute(
"value", fullText);
412 Element annotationsElement = mergedElement.getChild(
"annotations");
413 if (annotationsElement == null) {
414 annotationsElement =
new Element(
"annotations");
415 mergedElement.addContent(annotationsElement);
417 annotationsElement.addContent(annotation);
421 returnValue.addElement(mergedElement.detach());
429 }
catch (JDOMException ex) {
430 ex.printStackTrace();
431 }
catch (Exception ex) {
432 ex.printStackTrace();
437 static Element merge(Element e1, Element e2) {
439 Iterator i1 = e1.getDescendants();
440 Vector pcData1 =
new Vector();
441 while (i1.hasNext()) {
442 pcData1.addElement(i1.next());
445 Iterator i2 = e2.getDescendants(
new TextFilter());
446 Vector pcData2 =
new Vector();
447 while (i2.hasNext()) {
448 pcData2.addElement(i2.next());
451 int charBoundary = 0;
452 for (
int pos = 0; pos < pcData2.size() - 1; pos++) {
453 Text eventText = (Text) (pcData2.elementAt(pos));
454 Element anchor =
new Element(
"anchor");
455 Element
event = eventText.getParentElement();
456 String start =
event.getAttributeValue(
"e");
457 anchor.setAttribute(
"synch", start);
459 charBoundary += eventText.getText().length();
463 for (
int pos2 = 0; pos2 < pcData1.size(); pos2++) {
464 Object o = pcData1.elementAt(pos2);
465 if (!(o instanceof Text)) {
468 Text segmentText = (Text) o;
469 int textLength = segmentText.getText().length();
470 if (charCount + textLength < charBoundary) {
471 charCount += textLength;
473 }
else if (charCount + textLength == charBoundary) {
474 Element parent = segmentText.getParentElement();
475 int index = parent.indexOf(segmentText);
476 Element parentOfParent = parent.getParentElement();
477 int index2 = parentOfParent.indexOf(parent);
478 parentOfParent.addContent(index2 + 1, anchor);
482 String leftPart = segmentText.getText().substring(0, charBoundary - charCount);
483 String rightPart = segmentText.getText().substring(charBoundary - charCount);
484 Text leftText =
new Text(leftPart);
485 Text rightText =
new Text(rightPart);
490 Element parent = segmentText.getParentElement();
491 parent.removeContent(segmentText);
492 parent.addContent(leftText);
493 parent.addContent(anchor);
494 parent.addContent(rightText);
496 pcData1.remove(segmentText);
497 pcData1.add(pos2, rightText);
498 pcData1.add(pos2, anchor);
499 pcData1.add(pos2, leftText);
510 private void generateWordIDs(Document document)
throws JDOMException {
512 HashSet<String> allExistingIDs =
new HashSet<String>();
513 XPath idXPath = XPath.newInstance(
"//tei:*[@xml:id]");
514 idXPath.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
515 idXPath.addNamespace(Namespace.XML_NAMESPACE);
516 List idElements = idXPath.selectNodes(document);
517 for (Object o : idElements) {
518 Element e = (Element) o;
519 allExistingIDs.add(e.getAttributeValue(
"id", Namespace.XML_NAMESPACE));
523 XPath wordXPath = XPath.newInstance(
"//tei:w[not(@xml:id)]");
524 wordXPath.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
525 wordXPath.addNamespace(Namespace.XML_NAMESPACE);
527 List words = wordXPath.selectNodes(document);
529 for (Object o : words) {
530 Element word = (Element) o;
531 while (allExistingIDs.contains(
"w" + Integer.toString(count))) {
535 String wordID =
"w" + Integer.toString(count);
536 allExistingIDs.add(wordID);
538 word.setAttribute(
"id", wordID, Namespace.XML_NAMESPACE);
542 XPath pcXPath = XPath.newInstance(
"//tei:pc[not(@xml:id)]");
543 pcXPath.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
544 pcXPath.addNamespace(Namespace.XML_NAMESPACE);
546 List pcs = pcXPath.selectNodes(document);
548 for (Object o : pcs) {
549 Element pc = (Element) o;
550 while (allExistingIDs.contains(
"pc" + Integer.toString(count))) {
554 String pcID =
"pc" + Integer.toString(count);
555 allExistingIDs.add(pcID);
557 pc.setAttribute(
"id", pcID, Namespace.XML_NAMESPACE);
561 XPath incXPath = XPath.newInstance(
"//tei:event[not(@xml:id)]");
562 pcXPath.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
563 pcXPath.addNamespace(Namespace.XML_NAMESPACE);
565 List incs = incXPath.selectNodes(document);
567 for (Object o : incs) {
568 Element pc = (Element) o;
569 while (allExistingIDs.contains(
"inc" + Integer.toString(count))) {
573 String incID =
"inc" + Integer.toString(count);
574 allExistingIDs.add(incID);
576 pc.setAttribute(
"id", incID, Namespace.XML_NAMESPACE);
580 XPath segXPath = XPath.newInstance(
"//tei:seg[not(@xml:id)]");
581 pcXPath.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
582 pcXPath.addNamespace(Namespace.XML_NAMESPACE);
584 List segs = segXPath.selectNodes(document);
586 for (Object o : segs) {
587 Element seg = (Element) o;
588 while (allExistingIDs.contains(
"seg" + Integer.toString(count))) {
592 String segID =
"seg" + Integer.toString(count);
593 allExistingIDs.add(segID);
595 seg.setAttribute(
"id", segID, Namespace.XML_NAMESPACE);
600 private void setDocLanguage(Document teiDoc, String language)
throws JDOMException {
602 XPath xpathToLangAttribute = XPath.newInstance(
"//tei:text/@xml:lang");
603 xpathToLangAttribute.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
604 xpathToLangAttribute.addNamespace(Namespace.XML_NAMESPACE);
605 Attribute langAtt = (Attribute) xpathToLangAttribute.selectSingleNode(teiDoc);
606 if (langAtt != null) {
607 langAtt.setValue(language);
609 XPath xpathToTextElement = XPath.newInstance(
"//tei:text");
610 xpathToTextElement.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
611 xpathToTextElement.addNamespace(Namespace.XML_NAMESPACE);
612 Element textEl = (Element) xpathToTextElement.selectSingleNode(teiDoc);
613 textEl.setAttribute(
"lang", language, Namespace.XML_NAMESPACE);
615 System.out.println(
"Language of document set to " + language);
640 report =
function(cd);
647 Class cl = Class.forName(
"de.uni_hamburg.corpora.BasicTranscriptionData");
649 }
catch (ClassNotFoundException ex) {
650 report.
addException(ex,
"unknown class not found error");
658 String description =
"This class takes an exb as input and converts it into ISO standard TEI format. ";
664 throw new UnsupportedOperationException(
"Not supported yet.");
Report fix(CorpusData cd)
BasicTranscription getEXMARaLDAbt()
Collection< Class<?extends CorpusData > > getIsUsableFor()
Report execute(Corpus c, boolean fix)
static Vector TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation, boolean includeFullText)
void addCritical(String description)
String readInternalResourceAsString(String path2resource)
static String InputStream2String(InputStream is)
Report convertCD2MORPHEMEHIATISOTEI(CorpusData cd, boolean includeFullText, String XPath2Morphemes)
static Vector TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation)
void addCorrect(String statId, String description)
Document SegmentedTranscriptionToTEITranscription(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation, boolean includeFullText, CorpusData cd)
static org.jdom.Document String2JdomDocument(String stringRespresentingDocument)
Report check(CorpusData cd)
static String JdomDocument2String(org.jdom.Document jdomDocument)
void addException(Throwable e, String description)
void setLanguage(String lang)
void write(CorpusData cd, URL url)