corpus-services  1.0
VikusViewer.java
Go to the documentation of this file.
1 /*
2  * To change this license header, choose License Headers in Project Properties.
3  * To change this template file, choose Tools | Templates
4  * and open the template in the editor.
5  */
6 package de.uni_hamburg.corpora.visualization;
7 
12 import java.io.IOException;
13 import java.net.URISyntaxException;
14 import java.security.NoSuchAlgorithmException;
15 import java.util.Collection;
16 import javax.xml.parsers.ParserConfigurationException;
17 import javax.xml.transform.TransformerException;
18 import javax.xml.xpath.XPathExpressionException;
19 import org.exmaralda.partitureditor.fsm.FSMException;
20 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
21 import org.jdom.JDOMException;
22 import org.xml.sax.SAXException;
23 import com.google.gson.Gson;
24 import com.google.gson.GsonBuilder;
25 import com.google.gson.JsonElement;
26 import com.google.gson.JsonObject;
27 import com.google.gson.JsonParser;
28 import com.opencsv.CSVReader;
30 import java.io.File;
31 import java.io.FileNotFoundException;
32 import java.io.FileReader;
33 import java.io.InputStreamReader;
34 import java.net.URL;
35 import java.util.ArrayList;
36 import java.util.Collections;
37 import java.util.List;
38 import org.jdom.Attribute;
39 import org.jdom.Element;
40 import org.jdom.xpath.XPath;
41 
46 public class VikusViewer extends Visualizer {
47 
48  private static final String CONFIG_PATH = "/vikus-viewer/config.json";
49  private static final String DATA_PATH = "/vikus-viewer/data.csv";
50  private static final String INFO_PATH = "/vikus-viewer/info.md";
51  private static final String TIMELINE_PATH = "/vikus-viewer/timeline.csv";
52  private static final String AUDIO_IMAGE_PATH = "/vikus-viewer/sound.jpg";
53  ArrayList<String> keywordblacklist = new ArrayList<>();
54  URL vikusviewerurl;
55  String licence;
56  String version;
57  String corpusPrefix;
58  String title;
59  String description;
60  ArrayList<String> allDistinctYears = new ArrayList<>();
61 
62  @Override
63  public Report function(CorpusData cd) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
64  Report stats = new Report();
65  ComaData coma = (ComaData) cd;
67  vikusviewerurl = new URL(cd.getParentURL() + "resources/vikus-viewer");
68  File vikusviewerfolder = new File((vikusviewerurl).getFile());
69  if (!vikusviewerfolder.exists()) {
70  //the curation folder it not there and needs to be created
71  vikusviewerfolder.mkdirs();
72  }
73 
74  Element comadescription = coma.getCorpusDescription();
75  Element descriptioncoma = (Element) XPath.selectSingleNode(comadescription, "descendant::Key[@Name='DC:description']");
76  description = descriptioncoma.getText();
77  Element elcorpusPrefix = (Element) XPath.selectSingleNode(comadescription, "descendant::Key[@Name='hzsk:corpusPrefix']");
78  corpusPrefix = elcorpusPrefix.getText();
79  Element eltitle = (Element) XPath.selectSingleNode(comadescription, "descendant::Key[@Name='DC:title']");
80  title = eltitle.getText();
81  Element elversion = (Element) XPath.selectSingleNode(comadescription, "descendant::Key[@Name='hzsk:corpusVersion']");
82  version = elversion.getText();
83  Element ellicence = (Element) XPath.selectSingleNode(comadescription, "descendant::Key[@Name='DC:rights']");
84  licence = ellicence.getText();
85  stats.merge(createDataCSV(cd));
86  stats.merge(createConfigJSON(cd));
87  stats.merge(createInfoMD(cd));
88  stats.merge(createTimelineCSV(cd));
89  return stats;
90  }
91 
92  public void keywordBlacklist() {
93  keywordblacklist.add("and");
94  keywordblacklist.add("a");
95  keywordblacklist.add("the");
96  keywordblacklist.add("i");
97  keywordblacklist.add("in");
98  keywordblacklist.add("are");
99  keywordblacklist.add("is");
100  keywordblacklist.add("how");
101  keywordblacklist.add("an");
102  keywordblacklist.add("on");
103  keywordblacklist.add("of");
104  keywordblacklist.add("my");
105  keywordblacklist.add("with");
106  keywordblacklist.add("at");
107  keywordblacklist.add("...");
108  }
109 
110  public Report createDataCSV(CorpusData cd) throws FileNotFoundException, IOException, JDOMException {
111  //id,keywords,year,_dialect,_country,_region,_settlement,_language,_speaker,_transcription,_scorehtml,_listhtml,_pdf,_audio,_genre,_description
112  //"sketch,drawing",1890,Ket,Russia,Tomsk Oblast,sel,https://corpora.uni-hamburg.de/hzsk/de/islandora/object/transcript:selkup-0.1_AR_1965_RestlessNight_transl/datastream/EXB/AR_1965_RestlessNight_transl.exb,https://corpora.uni-hamburg.de/hzsk/de/islandora/object/file:selkup-0.1_KFN_1965_BearHunting1_nar/datastream/PDF/KFN_1965_BearHunting1_nar.pdf,https://corpora.uni-hamburg.de/hzsk/de/islandora/object/recording:selkup-0.1_DN_196X_Bread_nar/datastream/MP3/DN_196X_Bread_nar.mp3,flk,Male Torso,KAI_1965_OldWitch_flk
113  Report stats = new Report();
114  CSVReader reader;
115  CorpusIO cio = new CorpusIO();
116  reader = new CSVReader(new InputStreamReader(getClass().getResourceAsStream(DATA_PATH)), ',');
117  List<String[]> data = reader.readAll();
118  //create Row ForCommunications
119  ComaData coma = (ComaData) cd;
120  String transrepourl = "https://corpora.uni-hamburg.de/repository/transcript:" + corpusPrefix + "-" + version + "_";
121  String filerepourl = "https://corpora.uni-hamburg.de/repository/file:" + corpusPrefix + "-" + version + "_";
122  String recrepourl = "https://corpora.uni-hamburg.de/repository/recording:" + corpusPrefix + "-" + version + "_";
123  for (Element communication : coma.getCommunications()) {
124  String[] comrow = new String[16];
125  //id
126  Attribute id = (Attribute) XPath.selectSingleNode(communication, "@Name");
127  comrow[0] = id.getValue();
128  //keyword - year, genre, Title splitted by spaces
129  Element year = (Element) XPath.selectSingleNode(communication, "descendant::Description/Key[contains(@Name,'Date of recording')]");
130  System.out.println(year.getText());
131  if (!allDistinctYears.contains(year.getText())) {
132  allDistinctYears.add(year.getText());
133  }
134  Element descriptiondesc = (Element) XPath.selectSingleNode(communication, "descendant::Description/Key[contains(@Name,'Title')]");
135 
136  Element genre = (Element) XPath.selectSingleNode(communication, "descendant::Description/Key[contains(@Name,'Genre')]");
137  System.out.println(genre.getText());
138  Element settlement = (Element) XPath.selectSingleNode(communication, "descendant::Location/Description/Key[contains(@Name,'Settlement')]");
139  if(settlement==null){
140  settlement = new Element("Settlement");
141  }
142  System.out.println(settlement.getText());
143  Element speaker = (Element) XPath.selectSingleNode(communication, "descendant::Description/Key[contains(@Name,'Speakers')]");
144  System.out.println(speaker.getText());
145  String keywords = "\"";
146  if (descriptiondesc != null) {
147  System.out.println(descriptiondesc.getText());
148 
149  for (String s : descriptiondesc.getText().split(" ")) {
150  if (!keywordblacklist.contains(s.toLowerCase())) {
151  keywords += s + ",";
152  }
153  }
154  }
155  keywords += year.getText() + "," + genre.getText() + "," + settlement.getText() + "," + speaker.getText() + "\"";
156  comrow[1] = keywords;
157  //year - Description Date of Recording
158  comrow[2] = cleanForCSV(year.getText());
159  //dialect
160  Element dialect = (Element) XPath.selectSingleNode(communication, "descendant::Description/Key[contains(@Name,'Dialect')]");
161  System.out.println(dialect.getText());
162  comrow[3] = cleanForCSV(dialect.getText());
163  //country
164  Element country = (Element) XPath.selectSingleNode(communication, "descendant::Location/Description/Key[contains(@Name,'Country')]");
165  if(country==null){
166  country = new Element("Country");
167  }
168  System.out.println(country.getText());
169  comrow[4] = cleanForCSV(country.getText());
170  //region
171  Element region = (Element) XPath.selectSingleNode(communication, "descendant::Location/Description/Key[contains(@Name,'Region')]");
172  if(region==null){
173  region = new Element("Region");
174  }
175  System.out.println(region.getText());
176  comrow[5] = cleanForCSV(region.getText());
177  //settlement
178  comrow[6] = cleanForCSV(settlement.getText());
179  //language
180  Element language = (Element) XPath.selectSingleNode(communication, "descendant::Language/LanguageCode");
181  System.out.println(language.getText());
182  comrow[7] = cleanForCSV(language.getText());
183  //speaker
184  comrow[8] = "\"" + speaker.getText() + "\"";
185  //transcription url
186  //needs to look like https://corpora.uni-hamburg.de/repository/transcript:selkup-1.0_DN_196X_Bread_nar/EXB/DN_196X_Bread_nar.exb
187  String transcrurl = transrepourl + id.getValue() + "/EXB/" + id.getValue() + ".exb";
188  //Element transcription = (Element) XPath.selectSingleNode(communication, "descendant::Transcription/NSLink");
189  //System.out.println(transcription.getText());
190  //comrow[8] = transcription.getText();
191  comrow[9] = transcrurl;
192  //scorehtml url
193  //needs to look like
194  //https://corpora.uni-hamburg.de/repository/transcript:selkup-1.0_AGS_1964_SnakeInMouth_flk/SCORE/AGS_1964_SnakeInMouth_flk-score.html
195  String scoreurl = transrepourl + id.getValue() + "/SCORE/" + id.getValue() + "-score.html";
196  comrow[10] = scoreurl;
197  //listhtml url
198  //needs to look like
199  //https://corpora.uni-hamburg.de/repository/transcript:selkup-1.0_AGS_1964_SnakeInMouth_flk/LIST/AGS_1964_SnakeInMouth_flk-list.html
200  String listurl = transrepourl + id.getValue() + "/LIST/" + id.getValue() + "-list.html";
201  comrow[11] = listurl;
202  //pdf url
203  Element pdf = (Element) XPath.selectSingleNode(communication, "descendant::File[mimetype='application/pdf']/relPath']");
204  //audio url
205  Element audio = (Element) XPath.selectSingleNode(communication, "descendant::Recording/Media/NSLink");
206  //check for cases with no audio and no pdf or both!
207  String pdfrurl = filerepourl + id.getValue() + "/PDF/" + id.getValue() + ".pdf";
208  String audiourl = recrepourl + id.getValue() + "/MP3/" + id.getValue() + ".mp3";
209  Element transcription = (Element) XPath.selectSingleNode(communication, "descendant::Transcription/NSLink");
210  URL imageLocation = null;
211  if (transcription != null) {
212  imageLocation = new URL(cd.getParentURL() + transcription.getText().replaceFirst("[.][^.]+$", "") + ".jpg");
213  } else {
214  stats.addCritical(function, cd, id.getValue() + ": No transcription linked in communication in the coma file!");
215  }
216  if (pdf == null && audio == null) {
217  comrow[12] = "np pdf";
218  comrow[13] = "no audio";
219  stats.addCritical(function, cd, id.getValue() + ": No audio or pdf linked in communication in the coma file!");
220  } else if (pdf != null && audio != null) {
221  //we have both - add both links but don't add an audio image
222  comrow[12] = pdfrurl;
223  comrow[13] = audiourl;
224  stats.addCritical(function, cd, id.getValue() + ": Audio AND pdf linked in communication in the coma file!");
225  } else if (pdf != null) {
226  comrow[12] = pdfrurl;
227  comrow[13] = "no audio";
228  //TODO
229  //now we need an image jpeg of the first page
230  //cio.copyInternalBinaryFile(PDF_IMAGE_PATH, imageLocation);
231  } else {
232  comrow[12] = "no pdf";
233  comrow[13] = audiourl;
234  //now save the audio image in the folder with the correct name
235  if (imageLocation != null) {
236  cio.copyInternalBinaryFile(AUDIO_IMAGE_PATH, imageLocation);
237  }
238  }
239  //genre
240  System.out.println(genre.getText());
241  comrow[14] = genre.getText();
242  //description
243  if (descriptiondesc != null) {
244  String descdesc = "\"" + descriptiondesc.getText() + "\"";
245  comrow[15] = descdesc;
246  }
247  data.add(comrow);
248  }
249  String newdata = "";
250  for (String[] row : data) {
251  newdata += String.join(",", row) + "\n";
252  //first row = keys
253  //other rows = values
254  }
255  //now save the string array as csv
256  URL configJSONlocation = new URL(vikusviewerurl + "/data.csv");
257  cio.write(newdata, configJSONlocation);
258  stats.addCorrect(function, cd, "vikus-viewer config successfully created at " + configJSONlocation.toString());
259  return stats;
260  }
261 
262  public Report createConfigJSON(CorpusData cd) throws JDOMException, IOException {
263  Report stats = new Report();
264  CorpusIO cio = new CorpusIO();
265  String config = cio.readInternalResourceAsString(CONFIG_PATH);
266  JsonElement jelement = new JsonParser().parse(config);
267  JsonObject jobject = jelement.getAsJsonObject();
268  jobject = jobject.getAsJsonObject("project");
269  jobject.addProperty("name", title + " " + version);
270  Gson gson = new GsonBuilder().setPrettyPrinting().create();
271  String prettyJsonString = gson.toJson(jelement);
272  //System.out.println(prettyJsonString);
273  //now save it pretty printed
274  URL configJSONlocation = new URL(vikusviewerurl + "/config.json");
275  cio.write(prettyJsonString, configJSONlocation);
276  stats.addCorrect(function, cd, "vikus-viewer config successfully created at " + configJSONlocation.toString());
277  return stats;
278  }
279 
280  public Report createInfoMD(CorpusData cd) throws JDOMException, IOException {
281  Report stats = new Report();
282  CorpusIO cio = new CorpusIO();
283  String info = cio.readInternalResourceAsString(INFO_PATH);
284  String corpusnameandversion = title + " " + version;
285  info = info.replaceAll("_CORPUSNAME_", corpusnameandversion);
286  //_DESCRIPTION_ <Key Name="DC:description">
287  info = info.replaceAll("_DESCRIPTION_", description);
288  //_LICENCE_ <Key Name="DC:rights">CC BY-NC-SA 4.0</Key>
289  info = info.replaceAll("_LICENCE_", licence);
290  //now save the string array as csv
291  URL infoMDlocation = new URL(vikusviewerurl + "/info.md");
292  cio.write(info, infoMDlocation);
293  stats.addCorrect(function, cd, "vikus-viewer info.md successfully created at " + infoMDlocation.toString());
294  return stats;
295  }
296 
297  public Report createTimelineCSV(CorpusData cd) throws FileNotFoundException, IOException, JDOMException {
298  //year,titel,text,extra,link,kategorie
299  //1864,Early work,"Vincent begins drawing his surroundings early, at the age of 11 here.","The family van Gogh lives in the small town Zundert in the South of the Netherlands. Vincent later visits a middle school in Tilburg, where he lives far from his family. Despite his good grades, he leaves school in 1868, aged 15. From now on, he works for the international art firm Goupil & Cie.",,
300  Report stats = new Report();
301  CSVReader reader;
302  CorpusIO cio = new CorpusIO();
303  reader = new CSVReader(new InputStreamReader(getClass().getResourceAsStream(TIMELINE_PATH)), ',');
304  Collections.sort(allDistinctYears);
305  List<String[]> time = reader.readAll();
306  for (String year : allDistinctYears) {
307  String[] timerow = new String[6];
308  timerow[0] = year;
309  timerow[1] = "";
310  timerow[2] = "";
311  timerow[3] = "";
312  timerow[4] = "";
313  timerow[5] = "";
314  time.add(timerow);
315  }
316  String newtime = "";
317  for (String[] row : time) {
318  newtime += String.join(",", row) + "\n";
319  }
320  //now save the string array as csv
321  URL timelineCSVlocation = new URL(vikusviewerurl + "/timeline.csv");
322  cio.write(newtime, timelineCSVlocation);
323  stats.addCorrect(function, cd, "vikus-viewer config successfully created at " + timelineCSVlocation.toString());
324  return stats;
325  }
326 
327  @Override
328  public Report function(Corpus c) throws NoSuchAlgorithmException, ClassNotFoundException, FSMException, URISyntaxException, SAXException, IOException, ParserConfigurationException, JexmaraldaException, TransformerException, XPathExpressionException, JDOMException {
329  Report stats;
330  cd = c.getComaData();
331  stats = function(cd);
332  return stats;
333  }
334 
335  @Override
336  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
337  try {
338  Class cl = Class.forName("de.uni_hamburg.corpora.ComaData");
339  IsUsableFor.add(cl);
340  } catch (ClassNotFoundException ex) {
341  report.addException(ex, "Usable class not found.");
342  }
343  return IsUsableFor;
344  }
345 
346  public String cleanForCSV(String s){
347  s = s.replace(',', ' ');
348  s = s.replace('"', ' ');
349  s = s.replace('\'', ' ');
350  return s;
351  }
352 
353  @Override
354  public String getDescription() {
355  String description = "This class creates an config files needed "
356  + "for the vikus-viewer software. ";
357  return description;
358  }
359 
360 }
void merge(Report sr)
Definition: Report.java:73
void addCritical(String description)
Definition: Report.java:104
String readInternalResourceAsString(String path2resource)
Definition: CorpusIO.java:192
List< Element > getCommunications()
Definition: ComaData.java:285
Collection< Class<?extends CorpusData > > getIsUsableFor()
void addCorrect(String statId, String description)
Definition: Report.java:217
void addException(Throwable e, String description)
Definition: Report.java:287
void copyInternalBinaryFile(String internalPath, URL url)
Definition: CorpusIO.java:262
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:66