corpus-services  1.0
ComaData.java
Go to the documentation of this file.
1 /*
2  * To change this license header, choose License Headers in Project Properties.
3  * To change this template file, choose Tools | Templates
4  * and open the template in the editor.
5  */
6 package de.uni_hamburg.corpora;
7 
8 import org.exmaralda.coma.root.Coma;
10 import java.io.IOException;
11 import java.net.MalformedURLException;
12 import java.net.URISyntaxException;
13 import java.net.URL;
14 import java.util.Collection;
15 import java.nio.file.Files;
16 import java.nio.file.Paths;
17 import java.util.logging.Level;
18 import java.util.logging.Logger;
19 import org.jdom.Document;
20 import org.jdom.JDOMException;
21 import org.jdom.input.SAXBuilder;
22 import org.xml.sax.SAXException;
23 import java.net.URI;
24 import java.util.ArrayList;
25 import java.util.List;
26 import javax.xml.parsers.ParserConfigurationException;
27 import javax.xml.transform.TransformerException;
28 import javax.xml.xpath.XPathExpressionException;
29 import org.jdom.Element;
30 import org.jdom.xpath.XPath;
31 import org.apache.commons.io.FilenameUtils;
32 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
33 
38 public class ComaData implements Metadata, CorpusData, XMLData {
39 
40  //TODO
41  private Coma coma;
42  //TODO change exceptions to adding ReportItems
43  URL url;
44  Document readcomaasjdom = new Document();
45  String originalstring;
46  String filename;
47  String filenamewithoutending;
48 
50 
51  public static String SEGMENTED_FILE_XPATH = "//Transcription[Description/Key[@Name='segmented']/text()='true']/NSLink";
52  public static String BASIC_FILE_XPATH = "//Transcription[Description/Key[@Name='segmented']/text()='false']/NSLink";
53  public static String ALL_FILE_XPATH = "//Transcription/NSLink";
54  public static String CORPUSNAME_XPATH = "//Description/Key[@Name='DC:title']";
55 
56  String corpusname;
57 
58  public ArrayList<URL> referencedCorpusDataURLs = new ArrayList<URL>();
59 
/**
 * Creates an instance without reading any file; all fields keep the values
 * given by their declarations.
 */
public ComaData() {
    super();
}
62 
63  public ComaData(URL url) throws SAXException, JexmaraldaException {
64  try {
65  this.url = url;
66  SAXBuilder builder = new SAXBuilder();
67  readcomaasjdom = builder.build(url);
68  originalstring = new String(Files.readAllBytes(Paths.get(url.toURI())), "UTF-8");
69  URI uri = url.toURI();
70  URI parentURI = uri.getPath().endsWith("/") ? uri.resolve("..") : uri.resolve(".");
71  CORPUS_BASEDIRECTORY = parentURI.toURL();
72  filename = FilenameUtils.getName(url.getPath());
73  filenamewithoutending = FilenameUtils.getBaseName(url.getPath());
74  } catch (JDOMException ex) {
75  Logger.getLogger(UnspecifiedXMLData.class.getName()).log(Level.SEVERE, null, ex);
76  } catch (IOException ex) {
77  Logger.getLogger(UnspecifiedXMLData.class.getName()).log(Level.SEVERE, null, ex);
78  } catch (URISyntaxException ex) {
79  Logger.getLogger(BasicTranscriptionData.class.getName()).log(Level.SEVERE, null, ex);
80  }
81  }
82 
83 
84  /*public void updateReadcomaasjdom() throws SAXException, JexmaraldaException, MalformedURLException, JDOMException, IOException {
85  String xmlString =
86  SAXBuilder builder = new SAXBuilder();
87  readcomaasjdom = builder.build(xmlString);
88  }*/
89  @Override
90  public URL getURL() {
91  return url;
92  }
93 
94  @Override
95  public String toSaveableString() throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException {
96  return toPrettyPrintedXML();
97  }
98 
99  private String toPrettyPrintedXML() throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException {
100  PrettyPrinter pp = new PrettyPrinter();
101  String prettyCorpusData = pp.indent(toUnformattedString(), "event");
102  //String prettyCorpusData = pp.indent(bt.toXML(bt.getTierFormatTable()), "event");
103  return prettyCorpusData;
104  }
105 
106  @Override
107  public String toUnformattedString() {
108  return originalstring;
109  }
110 
111  //TODO!
112  @Override
113  public Collection<URL> getReferencedCorpusDataURLs() throws MalformedURLException, URISyntaxException {
114  for (URL rurul : getAllURLs()) {
115  if (!referencedCorpusDataURLs.contains(rurul)) {
116  referencedCorpusDataURLs.add(rurul);
117  }
118  }
119 
120  //now read the NSLinks and add the URLs from the files
121  //we need to have different ArrayLists for exb, exs, audio, pdf
122  //TODO!
124  }
125 
126  public Collection<URL> getAllBasicTranscriptionURLs() throws MalformedURLException, URISyntaxException {
127  URL resulturl;
128  ArrayList<URL> resulturls = new ArrayList<>();
129  try {
130  XPath xpath = XPath.newInstance(BASIC_FILE_XPATH);
131  List transcriptionList = xpath.selectNodes(readcomaasjdom);
132  for (int pos = 0; pos < transcriptionList.size(); pos++) {
133  Element nslink = (Element) (transcriptionList.get(pos));
134  //String fullTranscriptionName = CORPUS_BASEDIRECTORY.toURI().getPath() + nslink.getText();
135  resulturl = new URL(CORPUS_BASEDIRECTORY + nslink.getText());
136  //Paths.get(fullTranscriptionName).toUri().toURL();
137  resulturls.add(resulturl);
138  }
139  } catch (JDOMException ex) {
140  ex.printStackTrace();
141  }
142  return resulturls;
143  }
144 
145  public ArrayList<String> getAllBasicTranscriptionFilenames() {
146  try {
147  ArrayList<String> result = new ArrayList<>();
148  XPath xpath = XPath.newInstance(BASIC_FILE_XPATH);
149  List transcriptionList = xpath.selectNodes(readcomaasjdom);
150  for (int pos = 0; pos < transcriptionList.size(); pos++) {
151  Element nslink = (Element) (transcriptionList.get(pos));
152  // currentElement = nslink;
153  // String fullTranscriptionName = CORPUS_BASEDIRECTORY + "\\" +
154  // nslink.getText();
155  result.add(nslink.getText());
156  //resulturl = Paths.get(nslink.getText()).toUri().toURL();
157  //resulturls.add(resulturl);
158  }
159  return result;
160  } catch (JDOMException ex) {
161  ex.printStackTrace();
162  }
163  return null;
164  }
165 
166  public Collection<URL> getAllSegmentedTranscriptionURLs() throws MalformedURLException, URISyntaxException {
167  URL resulturl;
168  ArrayList<URL> resulturls = new ArrayList<>();
169  try {
170  XPath xpath = XPath.newInstance(SEGMENTED_FILE_XPATH);
171  List transcriptionList = xpath.selectNodes(readcomaasjdom);
172  for (int pos = 0; pos < transcriptionList.size(); pos++) {
173  Element nslink = (Element) (transcriptionList.get(pos));
174  //String fullTranscriptionName = CORPUS_BASEDIRECTORY.toURI().getPath() + nslink.getText();
175  resulturl = new URL(CORPUS_BASEDIRECTORY + nslink.getText());
176  //Paths.get(fullTranscriptionName).toUri().toURL();
177  resulturls.add(resulturl);
178  }
179  } catch (JDOMException ex) {
180  ex.printStackTrace();
181  }
182  return resulturls;
183  }
184 
185  public Collection<URL> getAllURLs() throws MalformedURLException, URISyntaxException {
186  URL resulturl;
187  ArrayList<URL> resulturls = new ArrayList<>();
188  try {
189  XPath xpath = XPath.newInstance(ALL_FILE_XPATH);
190  List transcriptionList = xpath.selectNodes(readcomaasjdom);
191  for (int pos = 0; pos < transcriptionList.size(); pos++) {
192  Element nslink = (Element) (transcriptionList.get(pos));
193  //String fullTranscriptionName = CORPUS_BASEDIRECTORY.toURI().getPath() + nslink.getText();
194  resulturl = new URL(CORPUS_BASEDIRECTORY + nslink.getText());
195  //Paths.get(fullTranscriptionName).toUri().toURL();
196  if (!resulturls.contains(resulturl)) {
197  resulturls.add(resulturl);
198  }
199  }
200  } catch (JDOMException ex) {
201  ex.printStackTrace();
202  }
203  return resulturls;
204  }
205 
206  public void updateUnformattedString(String newUnformattedString) {
207  originalstring = newUnformattedString;
208  }
209 
210  public void setBaseDirectory(URL url) {
211  CORPUS_BASEDIRECTORY = url;
212  }
213 
214  public URL getBasedirectory() throws URISyntaxException, MalformedURLException {
215  URI uri = url.toURI();
216  URI parentURI = uri.getPath().endsWith("/") ? uri.resolve("..") : uri.resolve(".");
217  CORPUS_BASEDIRECTORY = parentURI.toURL();
218  return CORPUS_BASEDIRECTORY;
219  }
220 
221  @Override
222  public URL getParentURL() {
223  return CORPUS_BASEDIRECTORY;
224  }
225 
226  @Override
227  public void setURL(URL nurl) {
228  url = nurl;
229  }
230 
231  @Override
232  public void setParentURL(URL url) {
233  CORPUS_BASEDIRECTORY = url;
234  }
235 
236  @Override
237  public String getFilename() {
238  return filename;
239  }
240 
241  @Override
242  public void setFilename(String s) {
243  filename = s;
244  }
245 
246  @Override
248  return filenamewithoutending;
249  }
250 
251  @Override
252  public void setFilenameWithoutFileEnding(String s) {
253  filenamewithoutending = s;
254  }
255 
256  @Override
257  public Document getJdom() {
258  return readcomaasjdom;
259  }
260 
261  @Override
262  public void setJdom(Document jdom) {
263  readcomaasjdom = jdom;
264  }
265 
266  public Coma getEXMARaLDAComa() {
267  return coma;
268  }
269 
270  public void setOriginalString(String s) {
271  originalstring = s;
272  }
273 
274  public String getCorpusName() throws JDOMException {
275  XPath xpath = XPath.newInstance(CORPUSNAME_XPATH);
276  Element name = (Element) xpath.selectSingleNode(readcomaasjdom);
277  corpusname = name.getText();
278  return corpusname;
279  }
280 
281  public void setCorpusName(String s) {
282  corpusname = s;
283  }
284 
285  public List<Element> getCommunications() throws JDOMException{
286  return XPath.selectNodes(readcomaasjdom, "//Communication");
287  }
288 
289  public Element getCorpusDescription() throws JDOMException{
290  return (Element) XPath.selectSingleNode(readcomaasjdom, "/Corpus/Description");
291  }
292 
293 
294  public Element getCorpusData() throws JDOMException{
295  return (Element) XPath.selectSingleNode(readcomaasjdom, "/Corpus/CorpusData");
296  }
297 }
ArrayList< URL > referencedCorpusDataURLs
Definition: ComaData.java:58
Collection< URL > getReferencedCorpusDataURLs()
Definition: ComaData.java:113
void setJdom(Document jdom)
Definition: ComaData.java:262
void setOriginalString(String s)
Definition: ComaData.java:270
ArrayList< String > getAllBasicTranscriptionFilenames()
Definition: ComaData.java:145
Collection< URL > getAllSegmentedTranscriptionURLs()
Definition: ComaData.java:166
String indent(String xml, String suppressedElements)
void setFilenameWithoutFileEnding(String s)
Definition: ComaData.java:252
List< Element > getCommunications()
Definition: ComaData.java:285
Collection< URL > getAllBasicTranscriptionURLs()
Definition: ComaData.java:126
Collection< URL > getAllURLs()
Definition: ComaData.java:185
void updateUnformattedString(String newUnformattedString)
Definition: ComaData.java:206
static String SEGMENTED_FILE_XPATH
Definition: ComaData.java:51