6 package de.uni_hamburg.corpora.conversion;
15 import java.io.IOException;
16 import java.util.HashSet;
17 import java.util.Hashtable;
18 import java.util.List;
19 import java.util.Vector;
21 import org.exmaralda.common.jdomutilities.IOUtilities;
22 import org.exmaralda.partitureditor.fsm.FSMException;
23 import org.exmaralda.partitureditor.jexmaralda.BasicTranscription;
24 import org.exmaralda.partitureditor.jexmaralda.SegmentedTranscription;
26 import java.io.UnsupportedEncodingException;
27 import java.net.MalformedURLException;
28 import java.net.URISyntaxException;
30 import org.exmaralda.partitureditor.jexmaralda.segment.HIATSegmentation;
31 import org.jdom.Attribute;
32 import org.jdom.Document;
33 import org.jdom.Element;
34 import org.jdom.JDOMException;
35 import org.jdom.Namespace;
37 import org.jdom.
transform.XSLTransformException;
38 import org.jdom.xpath.XPath;
39 import org.xml.sax.SAXException;
42 import javax.xml.parsers.ParserConfigurationException;
43 import javax.xml.transform.TransformerException;
44 import javax.xml.xpath.XPathExpressionException;
45 import org.exmaralda.common.corpusbuild.FileIO;
46 import org.exmaralda.common.corpusbuild.TextFilter;
47 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
// Language code handed to setDocLanguage(...) for the generated TEI document; defaults to "en".
60 String language =
"en";
// Classpath-style paths of the XSL stylesheets for the individual conversion steps.
// NOTE(review): presumably loaded through readInternalResourceAsString(...) - the loading code is not visible here, confirm.
63 static String TEI_SKELETON_STYLESHEET_ISO =
"/xsl/EXMARaLDA2ISOTEI_Skeleton.xsl";
64 static String SC_TO_TEI_U_STYLESHEET_ISO =
"/xsl/SegmentChain2ISOTEIUtterance.xsl";
65 static String SORT_AND_CLEAN_STYLESHEET_ISO =
"/xsl/ISOTEICleanAndSort.xsl";
66 static String TIME2TOKEN_SPAN_REFERENCES =
"/xsl/time2tokenSpanReferences.xsl";
67 static String REMOVE_TIME =
"/xsl/removeTimepointsWithoutAbsolute.xsl";
68 static String SPANS2_ATTRIBUTES =
"/xsl/spans2attributes.xsl";
// Path to an external finite state machine for the HIAT segmentation.
// The empty string means "none": the segmentation's pathToExternalFSM is only set when this is non-empty.
70 static String FSM =
"";
// XPath used to locate the node that receives the merged utterance elements.
// NOTE(review): this static field is reassigned to "//tei:body" inside the conversion code,
// so the initial value below is not the one that is effectively used afterwards.
72 static String BODY_NODE =
"//text";
// XPath selecting the tier with id "mb" in a basic transcription.
75 String XPath2Morphemes =
"/basic-transcription/basic-body/tier[@id = \"mb\"]";
// Names of the two segmentations that are passed on to the TEI merge ("deep" and "flat").
77 String nameOfDeepSegmentation =
"SpeakerContribution_Utterance_Word";
// NOTE(review): the identifier is spelled "Flategmentation" (missing "S"); it is referenced elsewhere, so it is left as is.
78 String nameOfFlategmentation =
"SpeakerContribution_Event";
// Mode flags, both off by default.
// NOTE(review): where these are read is not visible in this listing - confirm their meaning at the call sites.
93 static Boolean INEL =
false;
94 static Boolean TOKEN =
false;
118 XSLTransformException,
128 XSLTransformException,
145 Namespace teiNamespace = Namespace.getNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
146 Document comaDoc = FileIO.readDocumentFromLocalFile(cd.
getURL().getPath());
148 List<Element> communicationsList = XPath.selectNodes(comaDoc,
"//Communication");
150 for (Element communicationElement : communicationsList) {
152 List<Element> transcriptionsList = XPath.selectNodes(communicationElement,
"descendant::Transcription[ends-with(Filename,'.exb')]");
154 for (Element transcriptionElement : transcriptionsList) {
155 String transcriptID = transcriptionElement.getAttributeValue(
"Id");
156 String nsLink = transcriptionElement.getChildText(
"NSLink");
159 URL exburl =
new URL(fullPath);
164 nameOfDeepSegmentation,
165 nameOfFlategmentation,
169 Element transcriptIdnoElement =
new Element(
"idno", teiNamespace);
170 transcriptIdnoElement.setAttribute(
"type",
"HZSK-ID");
171 transcriptIdnoElement.setText(transcriptID);
172 finalDoc.getRootElement().addContent(0, transcriptIdnoElement);
174 XPath xp1 = XPath.newInstance(
"//tei:person");
175 xp1.addNamespace(teiNamespace);
176 List<Element> personL = xp1.selectNodes(finalDoc);
177 for (Element personE : personL) {
179 String personSigle = personE.getAttributeValue(
"n");
180 String xp2 =
"//Speaker[Sigle='" + personSigle +
"']";
181 Element speakerE = (Element) XPath.selectSingleNode(comaDoc, xp2);
182 String speakerID = speakerE.getAttributeValue(
"Id");
183 Element speakerIdnoElement =
new Element(
"idno", teiNamespace);
184 speakerIdnoElement.setAttribute(
"type",
"HZSK-ID");
185 speakerIdnoElement.setText(speakerID);
186 personE.addContent(0, speakerIdnoElement);
189 if (finalDoc != null) {
190 System.out.println(
"Merged");
192 setDocLanguage(finalDoc, language);
195 String filename = cdc.
getURL().getFile();
196 URL url =
new URL(
"file://" + filename.substring(0, filename.lastIndexOf(
".")) +
"_tei.xml");
197 System.out.println(url.toString());
198 cio.
write(finalDoc, url);
199 System.out.println(
"document written.");
200 stats.
addCorrect(
function, cdc,
"ISO TEI conversion of file was successful");
202 stats.
addCritical(
function, cdc,
"ISO TEI conversion of file was not possible because of unknown error");
208 }
catch (SAXException ex) {
209 stats.
addException(ex,
function, cd,
"Unknown exception error");
210 }
catch (FSMException ex) {
211 stats.
addException(ex,
function, cd,
"Unknown finite state machine error");
212 }
catch (MalformedURLException ex) {
213 stats.
addException(ex,
function, cd,
"Unknown file URL reading error");
214 }
catch (JDOMException ex) {
215 stats.
addException(ex,
function, cd,
"Unknown file reading error");
216 }
catch (IOException ex) {
217 stats.
addException(ex,
function, cd,
"Unknown file reading error");
218 }
catch (TransformerException ex) {
219 stats.
addException(ex,
function, cd,
"XSL transformer error");
220 }
catch (ParserConfigurationException ex) {
222 }
catch (XPathExpressionException ex) {
224 }
catch (URISyntaxException ex) {
225 stats.
addException(ex,
function, cd,
"ComaPath URI error");
226 }
catch (JexmaraldaException ex) {
227 stats.
addException(ex,
function, cd,
"Jexmeaalda error");
232 public Report convertEXB2MORPHEMEHIATISOTEI(
CorpusData cd)
throws SAXException, FSMException, JDOMException, IOException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, XPathExpressionException, URISyntaxException {
248 boolean includeFullText, String XPath2Morphemes)
throws SAXException, FSMException, JDOMException, IOException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, XPathExpressionException, URISyntaxException {
254 nameOfDeepSegmentation,
255 nameOfFlategmentation,
256 includeFullText, cd);
257 if (teiDoc != null) {
258 System.out.println(
"Merged");
260 setDocLanguage(teiDoc, language);
263 String filename = cd.
getURL().getFile();
264 URL url =
new URL(
"file://" + filename.substring(0, filename.lastIndexOf(
".")) +
"_tei.xml");
265 System.out.println(url.toString());
266 cio.
write(teiDoc, url);
267 System.out.println(
"document written.");
268 stats.
addCorrect(
function, cd,
"ISO TEI conversion of file was successful");
270 stats.
addCritical(
function, cd,
"ISO TEI conversion of file was not possible because of unknown error");
282 System.out.println((cd.
getURL()).getFile());
283 System.out.println(
"started writing document...");
285 HIATSegmentation segmentation =
new HIATSegmentation();
295 if (!FSM.equals(
"")) {
296 segmentation.pathToExternalFSM = FSM;
299 SegmentedTranscription st = segmentation.BasicToSegmented(bt);
300 System.out.println(
"Segmented transcription created");
307 String nameOfDeepSegmentation,
308 String nameOfFlatSegmentation,
309 boolean includeFullText,
CorpusData cd)
throws JDOMException, IOException, TransformerException, ParserConfigurationException, UnsupportedEncodingException, SAXException, XPathExpressionException, URISyntaxException {
311 Document finalDocument = null;
322 Document teiDocument = null;
329 if (result != null) {
332 System.out.println(
"STEP 1 completed.");
346 Vector uElements =
TEIMerge(segmentedTranscription, nameOfDeepSegmentation, nameOfFlatSegmentation, includeFullText);
348 XPath xp = XPath.newInstance(BODY_NODE);
349 BODY_NODE =
"//tei:body";
350 xp = XPath.newInstance(BODY_NODE);
351 xp.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
353 Element textNode = (Element) (xp.selectSingleNode(teiDocument));
354 textNode.addContent(uElements);
355 if (teiDocument != null) {
356 System.out.println(
"STEP 2 completed.");
358 Document transformedDocument = null;
364 transformedDocument = IOUtilities.readDocumentFromString(result2);
365 if (transformedDocument != null) {
367 textNode = (Element) (xp.selectSingleNode(transformedDocument));
368 System.out.println(
"STEP 3 completed.");
371 XPath xp2 = XPath.newInstance(
"//segmentation[@name='Event']/ats");
372 List events = xp2.selectNodes(segmentedTranscription);
373 for (
int pos = 0; pos < events.size(); pos++) {
374 Element exmaraldaEvent = (Element) (events.get(pos));
375 String category = exmaraldaEvent.getParentElement().getParentElement().getAttributeValue(
"category");
377 String elementName =
"event";
378 if (category.equals(
"pause")) {
379 elementName =
"pause";
382 Element teiEvent =
new Element(elementName);
384 String speakerID = exmaraldaEvent.getParentElement().getParentElement().getAttributeValue(
"speaker");
385 if (speakerID != null) {
386 teiEvent.setAttribute(
"who", speakerID);
388 teiEvent.setAttribute(
"start", exmaraldaEvent.getAttributeValue(
"s"));
389 teiEvent.setAttribute(
"end", exmaraldaEvent.getAttributeValue(
"e"));
390 if (!category.equals(
"pause")) {
391 teiEvent.setAttribute(
"desc", exmaraldaEvent.getText());
392 teiEvent.setAttribute(
"type", category);
394 String duration = exmaraldaEvent.getText().replaceAll(
"\\(",
"").replaceAll(
"\\)",
"");
395 teiEvent.setAttribute(
"dur", duration);
397 textNode.addContent(teiEvent);
421 = xslt.
transform(result4, remove_time_stylesheet);
423 = xslt.
transform(result5, spans_2_attributes_stylesheet);
424 transformedDocument = IOUtilities.readDocumentFromString(result6);
428 generateWordIDs(transformedDocument);
430 if (transformedDocument != null) {
436 if (result3 != null) {
437 finalDocument = IOUtilities.readDocumentFromString(result3);
438 if (finalDocument != null) {
446 return finalDocument;
// Convenience overload: merges the deep and the flat segmentation of a segmented
// transcription without adding "full-text" annotations (includeFullText = false).
// Delegates to the four-argument TEIMerge and returns its result unchanged.
449 public static Vector
TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation) {
450 return TEIMerge(segmentedTranscription, nameOfDeepSegmentation, nameOfFlatSegmentation,
false);
// Merges, for every segment chain ("ts") of the deep segmentation, the corresponding
// segment chain of the flat segmentation and attaches the annotations that overlap it.
//
// segmentedTranscription  - JDOM document of the segmented transcription
// nameOfDeepSegmentation  - value of segmentation/@name for the deep segmentation
// nameOfFlatSegmentation  - value of segmentation/@name for the flat segmentation
// includeFullText         - if true, an extra annotation with level "full-text" is added per chain
//
// The merged elements are detached and collected in a Vector.
// NOTE(review): the return statement and the opening "try {" are not visible in this listing;
// presumably the collected Vector is returned - confirm.
// JDOMException and any other Exception are caught at the end and only printed via printStackTrace().
471 public static Vector
TEIMerge(Document segmentedTranscription,
472 String nameOfDeepSegmentation,
473 String nameOfFlatSegmentation,
474 boolean includeFullText) {
// Map every timeline item id to its position among all //tli elements,
// so that start/end ids can be compared numerically further down.
478 Hashtable timelineItems =
new Hashtable();
479 String xpath =
"//tli";
480 XPath xpx = XPath.newInstance(xpath);
481 List tlis = xpx.selectNodes(segmentedTranscription);
482 for (
int pos = 0; pos < tlis.size(); pos++) {
484 timelineItems.put(((Element) (tlis.get(pos))).getAttributeValue(
"id"), pos);
// Collects the merged elements.
487 Vector returnValue =
new Vector();
// All segment chains of the deep segmentation.
488 XPath xp1 = XPath.newInstance(
"//segmentation[@name='" + nameOfDeepSegmentation +
"']/ts");
489 List segmentChains = xp1.selectNodes(segmentedTranscription);
491 for (Object segmentChain : segmentChains) {
492 Element sc = (Element) (segmentChain);
// Copy the speaker attribute from the grandparent element onto the segment chain itself.
493 sc.setAttribute(
"speaker", sc.getParentElement().getParentElement().getAttributeValue(
"speaker"));
494 String tierref = sc.getParentElement().getAttributeValue(
"tierref");
495 String start = sc.getAttributeValue(
"s");
496 String end = sc.getAttributeValue(
"e");
// Look up the segment chain of the flat segmentation with the same tierref, start and end.
497 String xpath2 =
"//segmentation[@name='" + nameOfFlatSegmentation +
"' and @tierref='" + tierref +
"']" 498 +
"/ts[@s='" + start +
"' and @e='" + end +
"']";
499 XPath xp2 = XPath.newInstance(xpath2);
500 Element sc2 = (Element) (xp2.selectSingleNode(segmentedTranscription));
// NOTE(review): the condition guarding this throw is not visible in this listing;
// presumably it fires when no matching flat segment chain was found (sc2 == null) - confirm.
505 throw new Exception(tierref +
" " + start +
" " + end);
508 Element mergedElement = merge(sc, sc2);
// Timeline positions of this chain's start and end.
511 int s = ((Integer) (timelineItems.get(start)));
512 int e = ((Integer) (timelineItems.get(end)));
// "ts" children of the matching flat segment chain; each one whose start or end position
// lies within [s, e] (bounds inclusive) becomes an <annotation> under <annotations>.
515 String xpath3 =
"//segmentation[@name='" + nameOfFlatSegmentation +
"' and @tierref='" + tierref +
"']" 516 +
"/ts[@s='" + start +
"' and @e='" + end +
"']/ts";
517 XPath xp3 = XPath.newInstance(xpath3);
518 List transannos = xp3.selectNodes(segmentedTranscription);
519 for (Object transanno1 : transannos) {
520 Element transanno = (Element) transanno1;
521 String transaStart = transanno.getAttributeValue(
"s");
522 String transaEnd = transanno.getAttributeValue(
"e");
523 int transas = ((Integer) (timelineItems.get(transaStart)));
524 int transae = ((Integer) (timelineItems.get(transaEnd)));
525 boolean transannotationBelongsToThisElement = (transas >= s && transas <= e) || (transae >= s && transae <= e);
526 if (transannotationBelongsToThisElement) {
// Create the <annotations> container lazily on first use.
527 Element annotationsElement = mergedElement.getChild(
"annotations");
528 if (annotationsElement == null) {
529 annotationsElement =
new Element(
"annotations");
530 mergedElement.addContent(annotationsElement);
532 Element annotation =
new Element(
"annotation");
533 annotation.setAttribute(
"start", transaStart);
534 annotation.setAttribute(
"end", transaEnd);
// The level is the name attribute of the element two levels above the "ts".
535 annotation.setAttribute(
"level", transanno.getParentElement().getParentElement().getAttributeValue(
"name"));
536 annotation.setAttribute(
"value", transanno.getText());
537 annotationsElement.addContent(annotation);
// "ta" elements below segmented-tier[@id = tierref]/annotation; the same inclusive
// overlap test as above decides whether they are attached to this merged element.
542 String xpath5 =
"//segmented-tier[@id='" + tierref +
"']/annotation/ta";
543 XPath xp5 = XPath.newInstance(xpath5);
544 List annotations = xp5.selectNodes(segmentedTranscription);
545 for (Object annotation1 : annotations) {
546 Element anno = (Element) (annotation1);
547 String aStart = anno.getAttributeValue(
"s");
548 String aEnd = anno.getAttributeValue(
"e");
549 int as = ((Integer) (timelineItems.get(aStart)));
550 int ae = ((Integer) (timelineItems.get(aEnd)));
551 boolean annotationBelongsToThisElement = (as >= s && as <= e) || (ae >= s && ae <= e);
552 if (annotationBelongsToThisElement) {
553 Element annotationsElement = mergedElement.getChild(
"annotations");
554 if (annotationsElement == null) {
555 annotationsElement =
new Element(
"annotations");
556 mergedElement.addContent(annotationsElement);
558 Element annotation =
new Element(
"annotation");
559 annotation.setAttribute(
"start", aStart);
560 annotation.setAttribute(
"end", aEnd);
// Here the level is the name attribute of the direct parent of the "ta".
561 annotation.setAttribute(
"level", anno.getParentElement().getAttributeValue(
"name"));
562 annotation.setAttribute(
"value", anno.getText());
563 annotationsElement.addContent(annotation);
// Optional extra annotation spanning the whole chain whose value is the concatenation
// of all descendant text nodes of the flat segment chain.
572 if (includeFullText) {
573 Element annotation =
new Element(
"annotation");
574 annotation.setAttribute(
"start", start);
575 annotation.setAttribute(
"end", end);
576 annotation.setAttribute(
"level",
"full-text");
578 String fullText =
"";
579 List l = XPath.selectNodes(sc2,
"descendant::text()");
// NOTE(review): the loop header iterating over l (declaring o) is not visible in this listing.
581 Text text = (Text) o;
582 fullText += text.getText();
584 annotation.setAttribute(
"value", fullText);
586 Element annotationsElement = mergedElement.getChild(
"annotations");
587 if (annotationsElement == null) {
588 annotationsElement =
new Element(
"annotations");
589 mergedElement.addContent(annotationsElement);
591 annotationsElement.addContent(annotation);
// Detach the merged element from its document before collecting it.
595 returnValue.addElement(mergedElement.detach());
// Errors are not propagated to the caller; they are only printed.
603 }
catch (JDOMException ex) {
604 ex.printStackTrace();
605 }
catch (Exception ex) {
606 ex.printStackTrace();
// Inserts <anchor synch="..."/> elements into e1 at the character offsets at which the
// text nodes of e2 end, so that the boundaries found in e2 are marked inside e1.
//
// e1 - element whose descendants are scanned and modified
// e2 - element whose text descendants (selected with TextFilter) supply the boundaries
//
// NOTE(review): several control lines (initialisation of charCount, continue/break, the final
// else and the return statement) are not visible in this listing; presumably e1 is returned - confirm.
611 static Element merge(Element e1, Element e2) {
// Snapshot of all descendants of e1 (elements and text nodes alike).
613 Iterator i1 = e1.getDescendants();
614 Vector pcData1 =
new Vector();
615 while (i1.hasNext()) {
616 pcData1.addElement(i1.next());
// Snapshot of the descendants of e2 that pass the TextFilter.
619 Iterator i2 = e2.getDescendants(
new TextFilter());
620 Vector pcData2 =
new Vector();
621 while (i2.hasNext()) {
622 pcData2.addElement(i2.next());
// Running character offset of the current boundary.
625 int charBoundary = 0;
// The last text node of e2 is deliberately left out (size() - 1): no anchor is created after it.
626 for (
int pos = 0; pos < pcData2.size() - 1; pos++) {
627 Text eventText = (Text) (pcData2.elementAt(pos));
628 Element anchor =
new Element(
"anchor");
629 Element
event = eventText.getParentElement();
// Despite its name, this variable holds the "e" attribute of the text node's parent element.
630 String start =
event.getAttributeValue(
"e");
631 anchor.setAttribute(
"synch", start);
633 charBoundary += eventText.getText().length();
// Walk through e1's descendants and find the text node in which the boundary falls.
637 for (
int pos2 = 0; pos2 < pcData1.size(); pos2++) {
638 Object o = pcData1.elementAt(pos2);
// NOTE(review): the statement executed for non-Text nodes is not visible here; presumably they are skipped.
639 if (!(o instanceof Text)) {
642 Text segmentText = (Text) o;
643 int textLength = segmentText.getText().length();
// Boundary lies beyond this text node: just advance the character count.
644 if (charCount + textLength < charBoundary) {
645 charCount += textLength;
// Boundary coincides with the end of this text node: put the anchor right after
// the text node's parent, inside the grandparent.
647 }
else if (charCount + textLength == charBoundary) {
648 Element parent = segmentText.getParentElement();
// NOTE(review): index is computed but not used in the lines visible here.
649 int index = parent.indexOf(segmentText);
650 Element parentOfParent = parent.getParentElement();
651 int index2 = parentOfParent.indexOf(parent);
652 parentOfParent.addContent(index2 + 1, anchor);
// Boundary falls inside this text node: split it at the boundary offset.
656 String leftPart = segmentText.getText().substring(0, charBoundary - charCount);
657 String rightPart = segmentText.getText().substring(charBoundary - charCount);
658 Text leftText =
new Text(leftPart);
659 Text rightText =
new Text(rightPart);
// Replace the original text node by left part, anchor, right part.
// NOTE(review): addContent without an index appends at the end of parent, so the original
// position is only preserved if segmentText was parent's last child - confirm.
664 Element parent = segmentText.getParentElement();
665 parent.removeContent(segmentText);
666 parent.addContent(leftText);
667 parent.addContent(anchor);
668 parent.addContent(rightText);
// Keep the snapshot in sync: inserting three times at pos2 in reverse order
// leaves leftText, anchor, rightText in that order starting at pos2.
670 pcData1.remove(segmentText);
671 pcData1.add(pos2, rightText);
672 pcData1.add(pos2, anchor);
673 pcData1.add(pos2, leftText);
// Assigns an xml:id to every tei:w, tei:pc, tei:event and tei:seg element of the
// document that does not have one yet. Generated ids are the prefixes "w", "pc",
// "inc" and "seg" followed by a counter; ids already present anywhere in the
// document are collected first and skipped, so no duplicate id is produced.
//
// document - TEI document that is modified in place
// throws JDOMException - if one of the XPath expressions cannot be compiled or evaluated
//
// NOTE(review): the declaration, increment and any reset of "count" are not visible in
// this listing and are therefore left exactly as they are in the full source file.
684 private void generateWordIDs(Document document)
throws JDOMException {
// Collect every id that is already in use.
686 HashSet<String> allExistingIDs =
new HashSet<String>();
687 XPath idXPath = XPath.newInstance(
"//tei:*[@xml:id]");
688 idXPath.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
689 idXPath.addNamespace(Namespace.XML_NAMESPACE);
690 List idElements = idXPath.selectNodes(document);
691 for (Object o : idElements) {
692 Element e = (Element) o;
693 allExistingIDs.add(e.getAttributeValue(
"id", Namespace.XML_NAMESPACE));
// Words without an id.
697 XPath wordXPath = XPath.newInstance(
"//tei:w[not(@xml:id)]");
698 wordXPath.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
699 wordXPath.addNamespace(Namespace.XML_NAMESPACE);
701 List words = wordXPath.selectNodes(document);
703 for (Object o : words) {
704 Element word = (Element) o;
// Advance the counter until an unused id is found.
705 while (allExistingIDs.contains(
"w" + Integer.toString(count))) {
709 String wordID =
"w" + Integer.toString(count);
710 allExistingIDs.add(wordID);
712 word.setAttribute(
"id", wordID, Namespace.XML_NAMESPACE);
// Punctuation characters without an id.
716 XPath pcXPath = XPath.newInstance(
"//tei:pc[not(@xml:id)]");
717 pcXPath.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
718 pcXPath.addNamespace(Namespace.XML_NAMESPACE);
720 List pcs = pcXPath.selectNodes(document);
722 for (Object o : pcs) {
723 Element pc = (Element) o;
724 while (allExistingIDs.contains(
"pc" + Integer.toString(count))) {
728 String pcID =
"pc" + Integer.toString(count);
729 allExistingIDs.add(pcID);
731 pc.setAttribute(
"id", pcID, Namespace.XML_NAMESPACE);
// Events without an id.
// Fix: the namespaces are registered on incXPath itself; before, they were registered
// a second time on pcXPath, leaving the "tei" and "xml" prefixes of this expression
// without an explicit binding.
735 XPath incXPath = XPath.newInstance(
"//tei:event[not(@xml:id)]");
736 incXPath.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
737 incXPath.addNamespace(Namespace.XML_NAMESPACE);
739 List incs = incXPath.selectNodes(document);
741 for (Object o : incs) {
742 Element pc = (Element) o;
743 while (allExistingIDs.contains(
"inc" + Integer.toString(count))) {
747 String incID =
"inc" + Integer.toString(count);
748 allExistingIDs.add(incID);
750 pc.setAttribute(
"id", incID, Namespace.XML_NAMESPACE);
// Segments without an id.
// Fix: same correction as for the events - the namespaces now go to segXPath, not pcXPath.
754 XPath segXPath = XPath.newInstance(
"//tei:seg[not(@xml:id)]");
755 segXPath.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
756 segXPath.addNamespace(Namespace.XML_NAMESPACE);
758 List segs = segXPath.selectNodes(document);
760 for (Object o : segs) {
761 Element seg = (Element) o;
762 while (allExistingIDs.contains(
"seg" + Integer.toString(count))) {
766 String segID =
"seg" + Integer.toString(count);
767 allExistingIDs.add(segID);
769 seg.setAttribute(
"id", segID, Namespace.XML_NAMESPACE);
// Sets the xml:lang attribute of the tei:text element of the given TEI document.
// If the attribute already exists its value is overwritten; otherwise it is created
// on the first tei:text element found.
//
// teiDoc   - TEI document that is modified in place
// language - language code to write
// throws JDOMException - if an XPath expression cannot be compiled or evaluated, or if the
//                        document contains no tei:text element to carry the attribute
774 private void setDocLanguage(Document teiDoc, String language)
throws JDOMException {
776 XPath xpathToLangAttribute = XPath.newInstance(
"//tei:text/@xml:lang");
777 xpathToLangAttribute.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
778 xpathToLangAttribute.addNamespace(Namespace.XML_NAMESPACE);
779 Attribute langAtt = (Attribute) xpathToLangAttribute.selectSingleNode(teiDoc);
780 if (langAtt != null) {
781 langAtt.setValue(language);
783 XPath xpathToTextElement = XPath.newInstance(
"//tei:text");
784 xpathToTextElement.addNamespace(
"tei",
"http://www.tei-c.org/ns/1.0");
785 xpathToTextElement.addNamespace(Namespace.XML_NAMESPACE);
786 Element textEl = (Element) xpathToTextElement.selectSingleNode(teiDoc);
// Fix: fail with the declared exception type and a clear message instead of a bare
// NullPointerException when the document has no tei:text element.
if (textEl == null) {
throw new JDOMException("No tei:text element found, cannot set xml:lang to " + language);
}
787 textEl.setAttribute(
"lang", language, Namespace.XML_NAMESPACE);
789 System.out.println(
"Language of document set to " + language);
812 Class cl = Class.forName(
"de.uni_hamburg.corpora.BasicTranscriptionData");
817 }
catch (ClassNotFoundException ex) {
818 report.
addException(ex,
"unknown class not found error");
825 String description =
"This class takes an exb as input and converts it into ISO standard TEI format. ";
void setLanguage(String lang)
BasicTranscription getEXMARaLDAbt()
CorpusData readFileURL(URL url, Collection< Class<?extends CorpusData >> clcds)
Collection< Class<?extends CorpusData > > getIsUsableFor()
static Vector TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation, boolean includeFullText)
void setFSM(String newfsm)
void addCritical(String description)
Document cd2SegmentedTranscription(CorpusData cd)
Report convertEXB2MORPHEMEHIATISOTEI(CorpusData cd, boolean includeFullText, String XPath2Morphemes)
String readInternalResourceAsString(String path2resource)
void addCorrect(String statId, String description)
Report convertEXB2MORPHEMEHIATISOTEI(CorpusData cd)
static org.jdom.Document String2JdomDocument(String stringRespresentingDocument)
Document SegmentedTranscriptionToTEITranscription(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation, boolean includeFullText, CorpusData cd)
static String JdomDocument2String(org.jdom.Document jdomDocument)
void addException(Throwable e, String description)
void write(CorpusData cd, URL url)
static Vector TEIMerge(Document segmentedTranscription, String nameOfDeepSegmentation, String nameOfFlatSegmentation)
Report convertCOMA2MORPHEMEHIATISOTEI(CorpusData cd)