1 package de.uni_hamburg.corpora.validation;
9 import java.io.IOException;
10 import java.net.URISyntaxException;
11 import java.util.ArrayList;
12 import java.util.Collection;
13 import java.util.Collections;
14 import java.util.HashMap;
15 import java.util.List;
16 import javax.xml.parsers.DocumentBuilder;
17 import javax.xml.parsers.DocumentBuilderFactory;
18 import javax.xml.parsers.ParserConfigurationException;
19 import javax.xml.transform.TransformerException;
20 import javax.xml.xpath.XPathExpressionException;
21 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
22 import org.jdom.JDOMException;
23 import org.w3c.dom.Document;
24 import org.w3c.dom.Element;
25 import org.w3c.dom.NodeList;
26 import org.xml.sax.SAXException;
// Per-transcript annotation store for the first version of a file that is seen:
// transcript name -> (event key "tierID-start-end" -> annotation text of that event).
// Lazily created and only filled when the transcript name is not yet present.
40 HashMap<String, HashMap<String, String>> annotations;
// transcript name -> the distinct annotation texts collected while filling
// `annotations` (duplicates are skipped via a contains() check before adding).
41 HashMap<String, Collection<String>> distinctAnnotations;
// Same shape as `annotations` (transcript name -> event key -> annotation text),
// stored during the comparison pass against the entries already in `annotations`.
// NOTE(review): presumably this is the second version of the same transcript;
// the branching line that selects this pass is not visible here - confirm.
42 HashMap<String, HashMap<String, String>> annotationsTwo;
// NOTE(review): not referenced in the visible part of this file; purpose
// (sub-category counts per category?) could not be confirmed - TODO verify.
43 HashMap<String, Integer> noOfSubCategories;
// NOTE(review): not referenced in the visible part of this file; presumably
// maps a sub-category label to its parent category - TODO verify.
44 HashMap<String, String> subCategoryToCategory;
// Number of annotations taken from the local annotation counter of the
// comparison pass; used as the denominator of the agreement percentage and of
// the observed disagreement (dZero).
45 private int noOfAnnotations = 0;
// Number of events whose annotation differs between the two versions; reset to
// 0 at the start of the comparison pass and incremented per mismatch.
46 private int noOfDifferentAnnotations = 0;
61 throws SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException {
63 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
64 DocumentBuilder db = dbf.newDocumentBuilder();
66 String transcriptName;
67 if (doc.getElementsByTagName(
"transcription-name").getLength() > 0) {
68 transcriptName = doc.getElementsByTagName(
"transcription-name").item(0).getTextContent();
70 transcriptName =
"No Name Transcript";
72 NodeList tiers = doc.getElementsByTagName(
"tier");
74 if (annotations == null) {
75 annotations =
new HashMap<>();
77 if (distinctAnnotations == null) {
78 distinctAnnotations =
new HashMap<>();
82 if (!annotations.containsKey(transcriptName)) {
83 Collection<String> c =
new ArrayList<>();
84 HashMap<String, String> h =
new HashMap<>();
85 for (
int i = 0; i < tiers.getLength(); i++) {
86 Element tier = (Element) tiers.item(i);
87 if (tier.getAttribute(
"type").equals(
"a") && !tier.getAttribute(
"category").equals(
"c")) {
88 NodeList events = tier.getElementsByTagName(
"event");
89 String tierID = tier.getAttribute(
"id");
90 for (
int j = 0; j < events.getLength(); j++) {
91 Element
event = (Element) events.item(j);
92 String eventStart =
event.getAttribute(
"start");
93 String eventEnd =
event.getAttribute(
"end");
94 if(!c.contains(event.getTextContent()))
95 c.add(event.getTextContent());
96 String key = tierID+
"-"+eventStart+
"-"+eventEnd;
97 h.put(key, event.getTextContent());
102 annotations.put(transcriptName, h);
104 distinctAnnotations.put(transcriptName, c);
107 if (annotationsTwo == null) {
108 annotationsTwo =
new HashMap<>();
110 int annotationCounter = 0;
111 noOfDifferentAnnotations = 0;
112 HashMap<String, String> h =
new HashMap<>();
113 for (
int i = 0; i < tiers.getLength(); i++) {
114 Element tier = (Element) tiers.item(i);
115 HashMap map =
new HashMap(annotations.get(transcriptName));
116 if (tier.getAttribute(
"type").equals(
"a") && !tier.getAttribute(
"category").equals(
"c")) {
117 NodeList events = tier.getElementsByTagName(
"event");
118 String tierID = tier.getAttribute(
"id");
119 for (
int j = 0; j < events.getLength(); j++) {
120 Element
event = (Element) events.item(j);
121 String eventStart =
event.getAttribute(
"start");
122 String eventEnd =
event.getAttribute(
"end");
123 String key = tierID+
"-"+eventStart+
"-"+eventEnd;
124 h.put(key, event.getTextContent());
127 if (map.containsKey(key)) {
128 if (!map.get(key).equals(event.getTextContent())) {
129 stats.
addWarning(
"iaa-functionality",
"Exb file " + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf(
"/") + 1)
130 +
" is containing a different annotation for the same event (" + eventStart
131 +
") in its tier " + tierID +
" from another version of the same file! This version " 132 +
"has the annotation: " + event.getTextContent() +
", while the other version has the annotation: " 134 exmaError.addError(
"iaa-functionality", cd.getURL().getFile(), tierID, eventStart,
false,
135 "Exb file " + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf(
"/") + 1)
136 +
" is containing a different annotation for the same event (" + eventStart
137 +
") in its tier " + tierID +
" from another version of the same file! This version " 138 +
"has the annotation: " + event.getTextContent() +
", while the other version has the annotation: " 140 noOfDifferentAnnotations++;
143 noOfDifferentAnnotations++;
149 annotationsTwo.put(transcriptName, h);
150 List list =
new ArrayList(distinctAnnotations.get(transcriptName));
152 noOfAnnotations = annotationCounter;
153 int partOfDenominator = noOfAnnotations*2;
154 for (Object event : list) {
155 String ev = (String) event;
156 int eventOccurrence = Collections.frequency(annotations.get(transcriptName).values(), ev);
157 int eventOccurrenceTwo = Collections.frequency(annotationsTwo.get(transcriptName).values(), ev);
158 int totalFirstEv = eventOccurrence + eventOccurrenceTwo;
159 for (Object eventIn : list){
160 String evIn = (String) eventIn;
165 int secEventOccurrence = Collections.frequency(annotations.get(transcriptName).values(), evIn);
166 int secEventOccurrenceTwo = Collections.frequency(annotationsTwo.get(transcriptName).values(), evIn);
167 int totalSecEv = secEventOccurrence + secEventOccurrenceTwo;
168 dE = dE + (totalFirstEv * totalSecEv) / (
float) (partOfDenominator * (partOfDenominator-1));
171 float iaa = (noOfAnnotations - noOfDifferentAnnotations) / (
float) noOfAnnotations;
172 float dZero = noOfDifferentAnnotations / (float) noOfAnnotations;
173 float alpha = 1 - ((dZero)/(
float)(dE));
174 System.out.println(
"The percentage of overlapping annotations between two versions of " 175 + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf(
"/") + 1) +
" is " + 100 * iaa +
"%");
176 System.out.println(
"Inter annotator agreement between two versions of " 177 + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf(
"/") + 1)
178 +
" according to Krippendorff's alpha is " + alpha);
179 stats.
addNote(
"iaa-functionality",
"The percentage of overlapping annotations between two versions of " 180 + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf(
"/") + 1) +
" is " + 100 * iaa +
"%");
181 stats.
addNote(
"iaa-functionality",
"Inter annotator agreement between two versions of " 182 + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf(
"/") + 1)
183 +
" according to Krippendorff's alpha is " + alpha);
196 Class cl = Class.forName(
"de.uni_hamburg.corpora.BasicTranscriptionData");
198 }
catch (ClassNotFoundException ex) {
209 String description =
"This class calculates IAA according to Krippendorff's" 210 +
" alpha for exb files; only cares for annotation labels, assuming" 211 +
" that transcription structure and text remains the same. Checks" 212 +
" and puts them in the error lists if different versions of the" 213 +
" same file have different annotations for the same event/token." 214 +
" Moreover, this functionality includes the inter-annotator agreement:" 215 +
" percentage of overlapping choices between the annotators.";
220 public Report function(
Corpus c, Boolean fix)
throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException, JexmaraldaException {
222 for (
CorpusData cdata : c.getBasicTranscriptionData()) {
223 stats.
merge(
function(cdata, fix));
Collection< Class<?extends CorpusData > > getIsUsableFor()
void addNote(String statId, String description)
static ExmaErrorList exmaError
void addWarning(String statId, String description)
static InputStream String2InputStream(String s)
void addException(Throwable e, String description)