corpus-services  1.0
CorpusIO.java
Go to the documentation of this file.
1 package de.uni_hamburg.corpora;
2 
5 import java.io.File;
6 import java.io.FileNotFoundException;
7 import java.io.FileOutputStream;
8 import java.io.IOException;
9 import java.io.InputStream;
10 import java.io.OutputStream;
11 import java.io.UnsupportedEncodingException;
12 import static java.lang.System.out;
13 import java.net.URISyntaxException;
14 import java.net.URL;
15 import java.nio.file.DirectoryStream;
16 import java.nio.file.Files;
17 import java.nio.file.Path;
18 import java.nio.file.Paths;
19 import java.util.ArrayList;
20 import java.util.Calendar;
21 import java.util.Collection;
22 import java.util.TimeZone;
23 import javax.xml.parsers.ParserConfigurationException;
24 import javax.xml.transform.TransformerException;
25 import javax.xml.xpath.XPathExpressionException;
26 import org.apache.commons.io.IOUtils;
27 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
28 import org.jdom.Document;
29 import org.jdom.JDOMException;
30 import org.jdom.output.XMLOutputter;
31 import org.xml.sax.SAXException;
32 
38 public class CorpusIO {
39 
40  //that's the local filepath or repository url
41  URL url;
42  Collection<CorpusData> cdc = new ArrayList();
43  Collection<URL> recursed = new ArrayList();
44  Collection<URL> alldata = new ArrayList();
45  Collection<Class<? extends CorpusData>> allCorpusDataTypes = new ArrayList();
47  ComaData coma = new ComaData();
48  AnnotationSpecification asp = new AnnotationSpecification();
49  CmdiData cmdidata = new CmdiData();
52 
53  public CorpusIO() {
54  allCorpusDataTypes.add(bt.getClass());
55  allCorpusDataTypes.add(coma.getClass());
56  allCorpusDataTypes.add(asp.getClass());
57  allCorpusDataTypes.add(cmdidata.getClass());
58  allCorpusDataTypes.add(usdata.getClass());
59  allCorpusDataTypes.add(segdata.getClass());
60  }
61 
62  public String CorpusData2String(CorpusData cd) throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException {
63  return cd.toSaveableString();
64  }
65 
66  public void write(CorpusData cd, URL url) throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException {
67  write(cd.toSaveableString(), cd.getURL());
68  }
69 
70  //TODO
71  public void write(String s, URL url) throws FileNotFoundException, IOException {
72  //If URL is on fileserver only...
73  System.out.println("started writing document...");
74  outappend("============================\n");
75  FileOutputStream fos = new FileOutputStream(new File(url.getFile()));
76  fos.write(s.getBytes(("UTF-8")));
77  fos.close();
78  System.out.println("Document written...");
79  }
80 
81  public void write(Document doc, URL url) throws IOException, TransformerException, ParserConfigurationException, ParserConfigurationException, UnsupportedEncodingException, UnsupportedEncodingException, SAXException, XPathExpressionException {
82  XMLOutputter xmOut = new XMLOutputter();
83  String unformattedCorpusData = xmOut.outputString(doc);
84  PrettyPrinter pp = new PrettyPrinter();
85  String prettyCorpusData = pp.indent(unformattedCorpusData, "event");
86  write(prettyCorpusData, url);
87  }
88 
89  public void write(org.w3c.dom.Document doc, URL url) throws IOException, TransformerException, ParserConfigurationException, ParserConfigurationException, UnsupportedEncodingException, UnsupportedEncodingException, SAXException, XPathExpressionException {
90  String unformattedCorpusData = TypeConverter.W3cDocument2String(doc);
91  PrettyPrinter pp = new PrettyPrinter();
92  String prettyCorpusData = pp.indent(unformattedCorpusData, "event");
93  write(prettyCorpusData, url);
94  }
95 
96  public void outappend(String a) {
97  Calendar cal = Calendar.getInstance(TimeZone.getDefault());
98  java.text.SimpleDateFormat sdf = new java.text.SimpleDateFormat("dd-MM-yyyy HH:mm:ss");
99  String time = sdf.format(cal.getTime());
100  out.append("[" + time + "] ");
101  out.append(a);
102  }
103 
104  public void write(Collection<CorpusData> cdc, URL url) {
105  //TODO
106  }
107 
108  /*
109  * The following methods need to be in the Iterators for Coma and CMDI that don't exist yet
110  *
111 
112  public abstract Collection getAllTranscripts();
113 
114  public abstract Collection getAllAudioFiles();
115 
116  public abstract Collection getAllVideoFiles();
117 
118  public abstract String getAudioLinkForTranscript();
119 
120  public abstract String getVideoLinkForTranscript();
121 
122  */
123  //read a single file as a corpus data object from an url
124  //only read it if it is needed
     //
     // Picks a corpus data class from the suffix (and, for some types, the
     // content) of url.getPath(), and builds it only when clcds contains the
     // class of the matching prototype field. Each successful read prints
     // "<filename> read". Returns null, after printing a message, when the URL
     // does not name an existing regular file or when no branch matches.
     //
     // NOTE(review): listing lines 129 and 149 are missing from this view, so
     // the declarations of btd and seg used below cannot be seen here.
125  public CorpusData readFileURL(URL url, Collection<Class<? extends CorpusData>> clcds) throws SAXException, JexmaraldaException, ClassNotFoundException {
     // NOTE(review): cd is assigned but never read in the visible code.
126  CorpusData cd = null;
     // Only the file part of the URL is checked against the local file system.
127  if (new File(url.getFile()).isFile()) {
     // NOTE(review): this is the only suffix test done without toLowerCase(),
     // so an upper-case "EXB" suffix would not match here.
128  if (url.getPath().endsWith("exb") && clcds.contains(bt.getClass())) {
130  System.out.println(btd.getFilename() + " read");
131  return btd;
132  } else if (url.getPath().toLowerCase().endsWith("coma") && clcds.contains(coma.getClass())) {
133  ComaData cm = new ComaData(url);
134  System.out.println(cm.getFilename() + " read");
135  return cm;
     // NOTE(review): the path is lower-cased before contains("Annotation"); a
     // lower-cased string cannot contain an upper-case 'A', so this branch
     // looks unreachable. Probably meant contains("annotation") - confirm.
136  } else if (url.getPath().toLowerCase().endsWith("xml") && ((url.getPath().toLowerCase().contains("Annotation"))) && clcds.contains(asp.getClass())) {
137  AnnotationSpecification as = new AnnotationSpecification(url);
138  System.out.println(as.getFilename() + " read");
139  return as;
     // Matches either an "xml" suffix with "cmdi" somewhere in the path, or a
     // "cmdi" suffix; both alternatives also require the CmdiData class in clcds.
140  } else if ((url.getPath().toLowerCase().endsWith("xml") && url.getPath().toLowerCase().contains("cmdi")) && clcds.contains(cmdidata.getClass()) || url.getPath().toLowerCase().endsWith("cmdi") && clcds.contains(cmdidata.getClass())) {
141  CmdiData cmdi = new CmdiData(url);
142  System.out.println(cmdi.getFilename() + " read");
143  return cmdi;
     // Fallback for any other path ending in "xml" that the branches above did not take.
144  } else if (url.getPath().toLowerCase().endsWith("xml") && clcds.contains(usdata.getClass())) {
145  UnspecifiedXMLData usd = new UnspecifiedXMLData(url);
146  System.out.println(usd.getFilename() + " read");
147  return usd;
148  } else if (url.getPath().toLowerCase().endsWith("exs") && clcds.contains(segdata.getClass())) {
150  System.out.println(seg.getFilename() + " read");
151  return seg;
152  } else {
     // No suffix matched, or the matching type is not in clcds.
153  System.out.println(url + " will not be read");
154  return null;
155  }
156  } else {
     // The file part of the URL does not name an existing regular file.
157  System.out.println("Critical: " + url + " cannot be read");
158  return null;
159  }
160  }
161 
162  //read a single file as a corpus data object from an url
163  public CorpusData readFileURL(URL url) throws SAXException, JexmaraldaException, ClassNotFoundException {
164  return readFileURL(url, allCorpusDataTypes);
165  }
166 
167  //read all the files as corpus data objects from a directory url
168  public Collection<CorpusData> read(URL url) throws URISyntaxException, IOException, SAXException, JexmaraldaException, ClassNotFoundException {
169  alldata = URLtoList(url);
170  for (URL readurl : alldata) {
171  CorpusData cdread = readFileURL(readurl);
172  if (cdread != null && !cdc.contains(cdread)) {
173  cdc.add(cdread);
174  }
175  }
176  return cdc;
177  }
178 
179  //read only the files as corpus data objects from a directory url that are specified in the Collection
180  public Collection<CorpusData> read(URL url, Collection<Class<? extends CorpusData>> chosencdc) throws URISyntaxException, IOException, SAXException, JexmaraldaException, ClassNotFoundException {
181  //To do
182  alldata = URLtoList(url);
183  for (URL readurl : alldata) {
184  CorpusData cdread = readFileURL(readurl);
185  if (cdread != null && !cdc.contains(cdread)) {
186  cdc.add(cdread);
187  }
188  }
189  return cdc;
190  }
191 
192  public String readInternalResourceAsString(String path2resource) throws JDOMException, IOException {
193  String xslstring = TypeConverter.InputStream2String(getClass().getResourceAsStream(path2resource));
194  System.out.println(path2resource);
195  if (xslstring == null) {
196  throw new IOException("Stylesheet not found!");
197  }
198  return xslstring;
199  }
200 
201  public String readExternalResourceAsString(String path2resource) throws JDOMException, IOException, URISyntaxException {
202  String xslstring = new String(Files.readAllBytes(Paths.get(new URL(path2resource).toURI())));
203  System.out.println(path2resource);
204  if (xslstring == null) {
205  throw new IOException("File not found!");
206  }
207  return xslstring;
208  }
209 
210  public Collection<URL> URLtoList(URL url) throws URISyntaxException, IOException {
211  if (isLocalFile(url)) {
212  //if the url points to a directory
213  if (isDirectory(url)) {
214  //we need to iterate
215  //and add everything to the list
216  Path path = Paths.get(url.toURI());
217  listFiles(path);
218  for (URL urlread : recursed) {
219  if (!isDirectory(urlread)) {
220  alldata.add(urlread);
221  }
222  }
223  return alldata;
224  } //if the url points to a file
225  else {
226  //we need to add just this file
227  alldata.add(url);
228  return alldata;
229  }
230  } else {
231  //it's a datastream in the repo
232  //TODO later
233  return null;
234  }
235  }
236 
240  public static boolean isLocalFile(java.net.URL url) {
241  String scheme = url.getProtocol();
242  return "file".equalsIgnoreCase(scheme) && !hasHost(url);
243  }
244 
248  public static boolean isDirectory(java.net.URL url) throws URISyntaxException {
249  //return new File(url.toURI()).isDirectory();
250  return Files.isDirectory(Paths.get(url.toURI()));
251  }
252 
253  public static boolean hasHost(java.net.URL url) {
254  String host = url.getHost();
255  return host != null && !"".equals(host);
256  }
257 
258  public void writePrettyPrinted(CorpusData cd, URL url) throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException {
259  write(cd.toSaveableString(), cd.getURL());
260  }
261 
262  public void copyInternalBinaryFile(String internalPath, URL url) throws FileNotFoundException, IOException {
263  InputStream in = getClass().getResourceAsStream(internalPath);
264  OutputStream out = new FileOutputStream(new File(url.getFile()));
265  IOUtils.copy(in, out);
266  }
267 
268  void listFiles(Path path) throws IOException {
269  try (DirectoryStream<Path> stream = Files.newDirectoryStream(path)) {
270  for (Path entry : stream) {
271  if (Files.isDirectory(entry)) {
272  listFiles(entry);
273  }
274  String sentry = entry.getFileName().toString().toLowerCase();
275  if (sentry.endsWith(".exb") || sentry.endsWith(".exs") || sentry.endsWith(".coma") || sentry.endsWith(".xml") || sentry.endsWith(".cmdi") || sentry.endsWith(".eaf") || sentry.endsWith(".flextext") || sentry.endsWith(".esa") || sentry.endsWith(".tei") || sentry.endsWith(".xsl")) {
276  recursed.add(entry.toUri().toURL());
277  }
278  }
279  }
280  }
281 }
void write(Collection< CorpusData > cdc, URL url)
Definition: CorpusIO.java:104
void write(org.w3c.dom.Document doc, URL url)
Definition: CorpusIO.java:89
void write(String s, URL url)
Definition: CorpusIO.java:71
void write(Document doc, URL url)
Definition: CorpusIO.java:81
static boolean isLocalFile(java.net.URL url)
Definition: CorpusIO.java:240
CorpusData readFileURL(URL url, Collection< Class<?extends CorpusData >> clcds)
Definition: CorpusIO.java:125
CorpusData readFileURL(URL url)
Definition: CorpusIO.java:163
String indent(String xml, String suppressedElements)
Collection< URL > URLtoList(URL url)
Definition: CorpusIO.java:210
void writePrettyPrinted(CorpusData cd, URL url)
Definition: CorpusIO.java:258
String readExternalResourceAsString(String path2resource)
Definition: CorpusIO.java:201
static boolean isDirectory(java.net.URL url)
Definition: CorpusIO.java:248
String readInternalResourceAsString(String path2resource)
Definition: CorpusIO.java:192
static String InputStream2String(InputStream is)
static String W3cDocument2String(org.w3c.dom.Document doc)
Collection< CorpusData > read(URL url)
Definition: CorpusIO.java:168
static boolean hasHost(java.net.URL url)
Definition: CorpusIO.java:253
void copyInternalBinaryFile(String internalPath, URL url)
Definition: CorpusIO.java:262
String CorpusData2String(CorpusData cd)
Definition: CorpusIO.java:62
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:66
Collection< CorpusData > read(URL url, Collection< Class<?extends CorpusData >> chosencdc)
Definition: CorpusIO.java:180