corpus-services  1.0
ComaKmlForLocations.java
Go to the documentation of this file.
1 package de.uni_hamburg.corpora.validation;
2 
9 import java.io.IOException;
10 import java.net.URISyntaxException;
11 import java.net.URL;
12 import java.nio.file.Paths;
13 import java.util.Collection;
14 import java.util.HashMap;
15 import java.util.logging.Level;
16 import java.util.logging.Logger;
17 import javax.xml.parsers.DocumentBuilder;
18 import javax.xml.parsers.DocumentBuilderFactory;
19 import javax.xml.parsers.ParserConfigurationException;
20 import javax.xml.transform.TransformerConfigurationException;
21 import javax.xml.transform.TransformerException;
22 import javax.xml.xpath.XPathExpressionException;
23 import org.jdom.JDOMException;
24 import org.w3c.dom.Document;
25 import org.w3c.dom.Element;
26 import org.w3c.dom.NodeList;
27 import org.xml.sax.SAXException;
28 
34 public class ComaKmlForLocations extends Checker implements CorpusFunction {
35 
36  String comaLoc = "";
37  String kmlFile;
38  HashMap<String, String> birthPlace; // hash map for holding the birthplaces of speakers
39  HashMap<String, String> domicile; // hash map for storing the residences of speakers
40  HashMap<String, String> commLocation; // hash map for holding locations where the communications took place
41  HashMap<String, String> lngLat; // hash map for holding coordinates of locations
42 
43  final String KEYBIRTHPLACE = "1a Place of birth";
44  final String KEYBIRTHPLACELL = "1c Place of birth (LngLat)";
45  final String KEYREGION = "2 Region";
46  final String KEYCOUNTRY = "3 Country";
47  final String KEYDOMICILE = "7a Domicile";
48  final String KEYDOMICILELL = "7c Domicile (LngLat)";
49  final String KEYOTHER = "8a Other information";
50  final String KEYCOUNTRYBARE = "Country";
51  final String KEYREGIONBARE = "Region";
52  final String KEYSETTLEMENT = "Settlement";
53  final String KEYSETTLEMENTLL = "Settlement (LngLat)";
54 
56  //fix available
57  super(true);
58  }
59 
63  @Override
64  public Report function(CorpusData cd, Boolean fix)
65  throws SAXException, IOException, ParserConfigurationException, URISyntaxException, TransformerConfigurationException, TransformerException, XPathExpressionException {
66  Report stats = new Report();
67  try {
68 
69  if (kmlFile != null) {
70  stats = getCoordinates();
71  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
72  DocumentBuilder db = dbf.newDocumentBuilder();
73  Document doc = db.parse(TypeConverter.String2InputStream(cd.toSaveableString())); // get the file as a document
74  NodeList communications = doc.getElementsByTagName("Communication"); // get all the communications in the corpus
75  NodeList speakers = doc.getElementsByTagName("Speaker"); // get all the speakers in the corpus
76 
77  if (birthPlace == null) {
78  birthPlace = new HashMap<>();
79  }
80  if (domicile == null) {
81  domicile = new HashMap<>();
82  }
83  if (commLocation == null) {
84  commLocation = new HashMap<>();
85  }
86  for (int i = 0; i < speakers.getLength(); i++) { //iterate through speakers
87  Element speaker = (Element) speakers.item(i);
88  Element sigle = (Element) speaker.getElementsByTagName("Sigle").item(0);
89  String sigleString = sigle.getTextContent();
90  NodeList locations = speaker.getElementsByTagName("Location");
91  String languageCode = speaker.getElementsByTagName("LanguageCode").item(0).getTextContent();
92  for (int j = 0; j < locations.getLength(); j++) {
93  Element location = (Element) locations.item(j);
94  if (location.getAttribute("Type").equals("Basic biogr. data")) {
95  NodeList keys = location.getElementsByTagName("Key");
96  String placeOfBirth = "";
97  String region = null;
98  String country = null;
99  String domicileStr = "";
100  boolean coorFlag = false;
101  boolean domCoor = false;
102  Element ref = null;
103  Element domRef = null;
104  for (int k = 0; k < keys.getLength(); k++) {
105  Element key = (Element) keys.item(k);
106  switch (key.getAttribute("Name")) {
107  case KEYBIRTHPLACE:
108  placeOfBirth = key.getTextContent();
109  break;
110  case KEYBIRTHPLACELL:
111  coorFlag = true;
112  break;
113  case KEYREGION:
114  region = key.getTextContent();
115  ref = key;
116  break;
117  case KEYCOUNTRY:
118  country = key.getTextContent();
119  break;
120  case KEYDOMICILE:
121  domicileStr = key.getTextContent();
122  break;
123  case KEYDOMICILELL:
124  domCoor = true;
125  break;
126  case KEYOTHER:
127  domRef = key;
128  break;
129  }
130  }
131  if (!placeOfBirth.equals("...") && !placeOfBirth.equals("")) {
132  if (placeOfBirth.endsWith("(?)")) {
133  placeOfBirth = placeOfBirth.substring(0, placeOfBirth.indexOf(" (?)"));
134  }
135  if (placeOfBirth.endsWith("`")) {
136  placeOfBirth = placeOfBirth.substring(0, placeOfBirth.indexOf("`"));
137  }
138 
139  // test for existing coordinates
140  String coordinates = "";
141  if(lngLat.containsKey(domicileStr + "-" + languageCode)){
142  coordinates = lngLat.get(domicileStr + "-" + languageCode);
143  } else if(lngLat.containsKey(domicileStr + "-")){
144  coordinates = lngLat.get(domicileStr + "-");
145  }
146 
147  if (coorFlag == false && (!coordinates.equals(""))) {
148  Element coordinatesKey = doc.createElement("Key");
149  coordinatesKey.setAttribute("Name", KEYBIRTHPLACELL);
150  coordinatesKey.setTextContent(coordinates);
151  Element loc = (Element) location.getElementsByTagName("Description").item(0);
152  loc.insertBefore(coordinatesKey, ref);
153  String message = "Added Key " + KEYBIRTHPLACELL + ": " + coordinatesKey + ") from KML (" + kmlFile + ") " + domicileStr + "' "
154  + "for speaker '" + sigleString + "'";
155  stats.addFix(function, cd, message);
156  } else if (!lngLat.containsKey(placeOfBirth + "-" + languageCode)) {
157  String message = "KML (" + kmlFile + ") does not contain the birthplace '" + placeOfBirth + "' "
158  + "from speaker '" + sigleString + "'";
159  System.out.println(message);
160  stats.addWarning(function, cd, message);
161  }
162  }
163 
164  if (!domicileStr.equals("...") && !domicileStr.equals("")) {
165  if (domicileStr.endsWith("(?)")) {
166  domicileStr = domicileStr.substring(0, domicileStr.indexOf(" (?)"));
167  }
168  if (domicileStr.endsWith("`")) {
169  domicileStr = domicileStr.substring(0, domicileStr.indexOf("`"));
170  }
171 
172  // test for existing coordinates
173  String coordinates = "";
174  if(lngLat.containsKey(domicileStr + "-" + languageCode)){
175  coordinates = lngLat.get(domicileStr + "-" + languageCode);
176  } else if(lngLat.containsKey(domicileStr + "-")){
177  coordinates = lngLat.get(domicileStr + "-");
178  }
179 
180  if (domCoor == false && (!coordinates.equals(""))) {
181  Element coordinatesKey = doc.createElement("Key");
182  coordinatesKey.setAttribute("Name", KEYDOMICILELL);
183  coordinatesKey.setTextContent(coordinates);
184  Element loc = (Element) location.getElementsByTagName("Description").item(0);
185  loc.insertBefore(coordinatesKey, domRef);
186  String message = "Added Key " + KEYDOMICILELL + ": " + coordinatesKey + ") from KML (" + kmlFile + ") " + domicileStr + "' "
187  + "for speaker '" + sigleString + "'";
188  stats.addFix(function, cd, message);
189  } else if (!lngLat.containsKey(domicileStr + "-" + languageCode)) {
190  String message = "KML (" + kmlFile + ") does not contain the domicile '" + domicileStr + "' "
191  + "from speaker '" + sigleString + "'";
192  System.out.println(message);
193  stats.addWarning(function, cd, message);
194  }
195  }
196  birthPlace.put(sigleString, new String(placeOfBirth + ", " + region + ", " + country));
197  domicile.put(sigleString, domicileStr);
198  break;
199  }
200  }
201  }
202  for (int i = 0; i < communications.getLength(); i++) { //iterate through communications
203  Element communication = (Element) communications.item(i);
204  Element location = (Element) communication.getElementsByTagName("Location").item(0); // get the location of the communication
205  String communicationID = communication.getAttribute("Id"); // get communication id
206  String communicationName = communication.getAttribute("Name"); // get communication name
207  NodeList keys = location.getElementsByTagName("Key");
208  String country = "";
209  String region = "";
210  String settlement = "";
211  String languageCode = communication.getElementsByTagName("LanguageCode").item(0).getTextContent();
212  boolean coorFlag = false;
213  for (int j = 0; j < keys.getLength(); j++) {
214  Element key = (Element) keys.item(j);
215  switch (key.getAttribute("Name")) {
216  case KEYCOUNTRYBARE:
217  country = key.getTextContent();
218  break;
219  case KEYREGIONBARE:
220  region = key.getTextContent();
221  break;
222  case KEYSETTLEMENT:
223  settlement = key.getTextContent();
224  break;
225  case KEYSETTLEMENTLL:
226  coorFlag = true;
227  break;
228  }
229  }
230  if (!settlement.equals("...") && !settlement.equals("")) {
231  if (settlement.endsWith("(?)")) {
232  settlement = settlement.substring(0, settlement.indexOf(" (?)"));
233  }
234  if (settlement.endsWith("`")) {
235  settlement = settlement.substring(0, settlement.indexOf("`"));
236  }
237 
238  // test for existing coordinates
239  String coordinates = "";
240  if(lngLat.containsKey(settlement + "-" + languageCode)){
241  coordinates = lngLat.get(settlement + "-" + languageCode);
242  } else if(lngLat.containsKey(settlement + "-")){
243  coordinates = lngLat.get(settlement + "-");
244  }
245 
246  if (coorFlag == false && (!coordinates.equals(""))) {
247  Element coordinatesKey = doc.createElement("Key");
248  coordinatesKey.setAttribute("Name", KEYSETTLEMENTLL);
249  coordinatesKey.setTextContent(coordinates);
250  Element loc = (Element) location.getElementsByTagName("Description").item(0);
251  loc.appendChild(coordinatesKey);
252  String message = "Added Key " + KEYSETTLEMENTLL + ": " + coordinatesKey + ") from KML (" + kmlFile + ") "
253  + "for communication '" + communicationName + "'";
254  stats.addFix(function, cd, message);
255  } else if (!lngLat.containsKey(settlement + "-" + languageCode)) {
256  String message = "KML (" + kmlFile + ") does not contain the settlement '" + settlement + "' "
257  + "from communication '" + communicationName + "'";
258  System.out.println(message);
259  stats.addWarning(function, cd, message);
260  }
261  }
262  commLocation.put(communicationID, new String(settlement + ", " + region + ", " + country));
263  }
264  if (fix) {
265  CorpusIO cio = new CorpusIO();
266  cd.updateUnformattedString(TypeConverter.W3cDocument2String(doc));
267  cio.write(cd, cd.getURL());
268  }
269  } else {
270  stats.addCritical(function, "No KML file path supplied");
271  }
272  } catch (JDOMException ex) {
273  Logger.getLogger(ComaKmlForLocations.class.getName()).log(Level.SEVERE, null, ex);
274  } catch (ParserConfigurationException ex) {
275  stats.addException(ex, function, cd, "The KML file could not be parsed.");
276  } catch (TransformerException ex) {
277  stats.addException(ex, function, cd, "Unknown Transformer error.");
278  } catch (XPathExpressionException ex) {
279  stats.addException(ex, function, cd, "Unknown XPath error.");
280  } catch (IOException ex) {
281  stats.addException(ex, function, cd, "The KML file could not be parsed.");
282  } catch (URISyntaxException ex) {
283  stats.addException(ex, function, cd, "URI syntax Exception.");
284  }
285  return stats; // return the report with warnings
286  }
287 
288  // sets the KML file path which is provided as input
289  public void setKMLFilePath(String path) {
290  this.kmlFile = path;
291  }
292 
293  // the method for getting coordinates of locations in the kml file
294  public Report getCoordinates() throws ParserConfigurationException, SAXException, IOException, JDOMException, URISyntaxException {
295  Report stats = new Report();
296  Document doc = null;
297  CorpusIO cio = new CorpusIO();
298  if (kmlFile != null) {
299  URL url = Paths.get(kmlFile).toUri().toURL();
300  String kmlString = cio.readExternalResourceAsString(url.toString());
301  if (kmlString != null) {
302  doc = TypeConverter.String2W3cDocument(kmlString);
303  if (lngLat == null) {
304  lngLat = new HashMap<>();
305  }
306  if (doc != null) {
307  NodeList placeMarks = doc.getElementsByTagName("Placemark");
308  for (int i = 0; i < placeMarks.getLength(); i++) { //iterate through place marks
309  Element placeMark = (Element) placeMarks.item(i);
310  Element name = (Element) placeMark.getElementsByTagName("name").item(0);
311  String nameOfPlace = name.getTextContent();
312  String language = "";
313  NodeList data = placeMark.getElementsByTagName("Data");
314  for (int j = 0; j < data.getLength(); j++) {
315  Element datum = (Element) data.item(j);
316  if (datum.getAttribute("name").equals("lang")) {
317  Element value = (Element) datum.getElementsByTagName("value").item(0);
318  language = value.getTextContent();
319  }
320  }
321  String coordinatesWithAltitude = placeMark.getElementsByTagName("coordinates").item(0).getTextContent();
322  String coordinate = coordinatesWithAltitude.trim().substring(0, coordinatesWithAltitude.trim().lastIndexOf(","));
323  lngLat.put(nameOfPlace + "-" + language, coordinate);
324  }
325  } else {
326  stats.addCritical(function, "KML file cannot be read");
327  }
328  } else {
329  stats.addCritical(function, "KML file cannot be read");
330  }
331  } else {
332  stats.addCritical(function, "No KML file path supplied");
333  }
334  return stats;
335  }
336 
342  @Override
343  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
344  try {
345  Class cl = Class.forName("de.uni_hamburg.corpora.ComaData");
346  IsUsableFor.add(cl);
347  } catch (ClassNotFoundException ex) {
348  report.addException(ex, " usable class not found");
349  }
350  return IsUsableFor;
351  }
352 
357  @Override
358  public String getDescription() {
359  String description = "This class identifies and lists fields which contain"
360  + " location information; creates a list of different location names;"
361  + " gets geo-coordinates for the location names via Google API.";
362  return description;
363  }
364 
365  @Override
366  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException, URISyntaxException, TransformerException, TransformerConfigurationException, XPathExpressionException {
367  Report stats = new Report();
368  cd = c.getComaData();
369  stats = function(cd, fix);
370  return stats;
371  }
372 
373 }
static org.w3c.dom.Document String2W3cDocument(String stringRespresentingDocument)
String readExternalResourceAsString(String path2resource)
Definition: CorpusIO.java:201
void addCritical(String description)
Definition: Report.java:104
Collection< Class<?extends CorpusData > > getIsUsableFor()
void addWarning(String statId, String description)
Definition: Report.java:164
static String W3cDocument2String(org.w3c.dom.Document doc)
static InputStream String2InputStream(String s)
void addException(Throwable e, String description)
Definition: Report.java:287
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:66
void addFix(String statId, CorpusData cd, String description)
Definition: Report.java:155