corpus-services  1.0
IAAFunctionality.java
Go to the documentation of this file.
1 package de.uni_hamburg.corpora.validation;
2 
9 import java.io.IOException;
10 import java.net.URISyntaxException;
11 import java.util.ArrayList;
12 import java.util.Collection;
13 import java.util.Collections;
14 import java.util.HashMap;
15 import java.util.List;
16 import javax.xml.parsers.DocumentBuilder;
17 import javax.xml.parsers.DocumentBuilderFactory;
18 import javax.xml.parsers.ParserConfigurationException;
19 import javax.xml.transform.TransformerException;
20 import javax.xml.xpath.XPathExpressionException;
21 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
22 import org.jdom.JDOMException;
23 import org.w3c.dom.Document;
24 import org.w3c.dom.Element;
25 import org.w3c.dom.NodeList;
26 import org.xml.sax.SAXException;
27 
37 public class IAAFunctionality extends Checker implements CorpusFunction {
38 
39  String annotLoc = "";
40  HashMap<String, HashMap<String, String>> annotations; // hash map for holding annotations of exb files
41  HashMap<String, Collection<String>> distinctAnnotations; // hash map for storing distinct annots for each transcription file
42  HashMap<String, HashMap<String, String>> annotationsTwo; // hash map for holding annotations of second exb files
43  HashMap<String, Integer> noOfSubCategories; // hash map for holding number of subcategories for every category
44  HashMap<String, String> subCategoryToCategory; // hash map for holding parent categories for sub categories
45  private int noOfAnnotations = 0; // total no of annotations
46  private int noOfDifferentAnnotations = 0; // total number of different annotations between different two different versions
47 
48  public IAAFunctionality() {
49  //no fixing available
50  super(false);
51  }
52 
59  @Override
60  public Report function(CorpusData cd, Boolean fix)
61  throws SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException {
62  Report stats = new Report(); //create a new report
63  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
64  DocumentBuilder db = dbf.newDocumentBuilder();
65  Document doc = db.parse(TypeConverter.String2InputStream(cd.toSaveableString())); // get the file as a document
66  String transcriptName;
67  if (doc.getElementsByTagName("transcription-name").getLength() > 0) { // check if transcript name exists for the exb file
68  transcriptName = doc.getElementsByTagName("transcription-name").item(0).getTextContent(); // get transcript name
69  } else {
70  transcriptName = "No Name Transcript";
71  }
72  NodeList tiers = doc.getElementsByTagName("tier"); // get all tiers of the transcript
73  //initialise the hash map only the first time when this function is called
74  if (annotations == null) {
75  annotations = new HashMap<>();
76  }
77  if (distinctAnnotations == null) {
78  distinctAnnotations = new HashMap<>();
79  }
80  //if annotations hash map doesn't contain the transcript's name it means
81  //that it is the first time a version of this file is encountered.
82  if (!annotations.containsKey(transcriptName)) {
83  Collection<String> c = new ArrayList<>(); // collection for adding annotations into hash map
84  HashMap<String, String> h = new HashMap<>();
85  for (int i = 0; i < tiers.getLength(); i++) { // loop for dealing with each tier
86  Element tier = (Element) tiers.item(i);
87  if (tier.getAttribute("type").equals("a") && !tier.getAttribute("category").equals("c")) { // if it is an annotation tier
88  NodeList events = tier.getElementsByTagName("event");
89  String tierID = tier.getAttribute("id");
90  for (int j = 0; j < events.getLength(); j++) { // annotation events
91  Element event = (Element) events.item(j);
92  String eventStart = event.getAttribute("start");
93  String eventEnd = event.getAttribute("end");
94  if(!c.contains(event.getTextContent())) // if annot not already added to the list
95  c.add(event.getTextContent());
96  String key = tierID+"-"+eventStart+"-"+eventEnd;
97  h.put(key, event.getTextContent());
98  }
99  }
100  }
101  if(!h.isEmpty())
102  annotations.put(transcriptName, h); // finally add the annotations of the transcript
103  if(!c.isEmpty())
104  distinctAnnotations.put(transcriptName, c);
105  } else { // another version of this transcript has already been encountered
106  //initialise the hash map only the first time when another version of any transcript is encountered
107  if (annotationsTwo == null) {
108  annotationsTwo = new HashMap<>();
109  }
110  int annotationCounter = 0; // counter for number of annotations
111  noOfDifferentAnnotations = 0;
112  HashMap<String, String> h = new HashMap<>();
113  for (int i = 0; i < tiers.getLength(); i++) { // loop for dealing with each tier
114  Element tier = (Element) tiers.item(i);
115  HashMap map = new HashMap(annotations.get(transcriptName));
116  if (tier.getAttribute("type").equals("a") && !tier.getAttribute("category").equals("c")) { // if it is an annotation tier
117  NodeList events = tier.getElementsByTagName("event");
118  String tierID = tier.getAttribute("id");
119  for (int j = 0; j < events.getLength(); j++) {
120  Element event = (Element) events.item(j);
121  String eventStart = event.getAttribute("start");
122  String eventEnd = event.getAttribute("end");
123  String key = tierID+"-"+eventStart+"-"+eventEnd;
124  h.put(key, event.getTextContent());
125  annotationCounter++;
126  // check if the event's annotation in one version is same with the same event's annotation in the other version
127  if (map.containsKey(key)) {
128  if (!map.get(key).equals(event.getTextContent())) {
129  stats.addWarning("iaa-functionality", "Exb file " + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1)
130  + " is containing a different annotation for the same event (" + eventStart
131  + ") in its tier " + tierID + " from another version of the same file! This version "
132  + "has the annotation: " + event.getTextContent() + ", while the other version has the annotation: "
133  + map.get(key));
134  exmaError.addError("iaa-functionality", cd.getURL().getFile(), tierID, eventStart, false,
135  "Exb file " + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1)
136  + " is containing a different annotation for the same event (" + eventStart
137  + ") in its tier " + tierID + " from another version of the same file! This version "
138  + "has the annotation: " + event.getTextContent() + ", while the other version has the annotation: "
139  + map.get(key));
140  noOfDifferentAnnotations++; // increase the counter for number of different annotations
141  }
142  }else{
143  noOfDifferentAnnotations++;
144  }
145  }
146  }
147  }
148  if(!h.isEmpty())
149  annotationsTwo.put(transcriptName, h); // finally add the annotations of the transcript
150  List list = new ArrayList(distinctAnnotations.get(transcriptName));
151  float dE = 0; // expected disagreement for Krippendorff's alpha
152  noOfAnnotations = annotationCounter;
153  int partOfDenominator = noOfAnnotations*2; //noOfItems*noOfCoders (number of coders = 2 since there are two versions(annotators))
154  for (Object event : list) { // go through every distinct annotation
155  String ev = (String) event;
156  int eventOccurrence = Collections.frequency(annotations.get(transcriptName).values(), ev);
157  int eventOccurrenceTwo = Collections.frequency(annotationsTwo.get(transcriptName).values(), ev);
158  int totalFirstEv = eventOccurrence + eventOccurrenceTwo;
159  for (Object eventIn : list){
160  String evIn = (String) eventIn;
161  // if it is the same annotation with the upper loop skip to the next one
162  // as the distance is 0 - that cancels out the other operations down there.
163  if(ev.equals(evIn))
164  continue;
165  int secEventOccurrence = Collections.frequency(annotations.get(transcriptName).values(), evIn);
166  int secEventOccurrenceTwo = Collections.frequency(annotationsTwo.get(transcriptName).values(), evIn);
167  int totalSecEv = secEventOccurrence + secEventOccurrenceTwo;
168  dE = dE + (totalFirstEv * totalSecEv) / (float) (partOfDenominator * (partOfDenominator-1));
169  }
170  }
171  float iaa = (noOfAnnotations - noOfDifferentAnnotations) / (float) noOfAnnotations; // inter-annotator measure
172  float dZero = noOfDifferentAnnotations / (float) noOfAnnotations; // observed disagreement for Krippendorff's alpha
173  float alpha = 1 - ((dZero)/(float)(dE)); // Krippendorff's alpha
174  System.out.println("The percentage of overlapping annotations between two versions of "
175  + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1) + " is " + 100 * iaa + "%");
176  System.out.println("Inter annotator agreement between two versions of "
177  + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1)
178  + " according to Krippendorff's alpha is " + alpha);
179  stats.addNote("iaa-functionality", "The percentage of overlapping annotations between two versions of "
180  + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1) + " is " + 100 * iaa + "%");
181  stats.addNote("iaa-functionality", "Inter annotator agreement between two versions of "
182  + cd.getURL().getFile().substring(cd.getURL().getFile().lastIndexOf("/") + 1)
183  + " according to Krippendorff's alpha is " + alpha);
184  }
185  return stats;
186  }
187 
193  @Override
194  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
195  try {
196  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
197  IsUsableFor.add(cl);
198  } catch (ClassNotFoundException ex) {
199  report.addException(ex, " usable class not found");
200  }
201  return IsUsableFor;
202  }
203 
207  @Override
208  public String getDescription() {
209  String description = "This class calculates IAA according to Krippendorff's"
210  + " alpha for exb files; only cares for annotation labels, assuming"
211  + " that transcription structure and text remains the same. Checks"
212  + " and puts them in the error lists if different versions of the"
213  + " same file have different annotations for the same event/token."
214  + " Moreover, this functionality includes the inter-annotator agreement:"
215  + " percentage of overlapping choices between the annotators.";
216  return description;
217  }
218 
219  @Override
220  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException, JexmaraldaException {
221  Report stats = new Report();
222  for (CorpusData cdata : c.getBasicTranscriptionData()) {
223  stats.merge(function(cdata, fix));
224  }
225  return stats;
226  }
227 
228 }
Collection< Class<?extends CorpusData > > getIsUsableFor()
void addNote(String statId, String description)
Definition: Report.java:245
void merge(Report sr)
Definition: Report.java:73
void addWarning(String statId, String description)
Definition: Report.java:164
static InputStream String2InputStream(String s)
void addException(Throwable e, String description)
Definition: Report.java:287