1 package de.uni_hamburg.corpora;
6 import java.io.FileNotFoundException;
7 import java.io.FileOutputStream;
8 import java.io.IOException;
9 import java.io.InputStream;
10 import java.io.OutputStream;
11 import java.io.UnsupportedEncodingException;
12 import static java.lang.System.out;
13 import java.net.URISyntaxException;
15 import java.nio.file.DirectoryStream;
16 import java.nio.file.Files;
17 import java.nio.file.Path;
18 import java.nio.file.Paths;
19 import java.util.ArrayList;
20 import java.util.Calendar;
21 import java.util.Collection;
22 import java.util.TimeZone;
23 import javax.xml.parsers.ParserConfigurationException;
24 import javax.xml.transform.TransformerException;
25 import javax.xml.xpath.XPathExpressionException;
26 import org.apache.commons.io.IOUtils;
27 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
28 import org.jdom.Document;
29 import org.jdom.JDOMException;
30 import org.jdom.output.XMLOutputter;
31 import org.xml.sax.SAXException;
42 Collection<CorpusData> cdc =
new ArrayList();
43 Collection<URL> recursed =
new ArrayList();
44 Collection<URL> alldata =
new ArrayList();
45 Collection<Class<? extends CorpusData>> allCorpusDataTypes =
new ArrayList();
48 AnnotationSpecification asp =
new AnnotationSpecification();
54 allCorpusDataTypes.add(bt.getClass());
55 allCorpusDataTypes.add(coma.getClass());
56 allCorpusDataTypes.add(asp.getClass());
57 allCorpusDataTypes.add(cmdidata.getClass());
58 allCorpusDataTypes.add(usdata.getClass());
59 allCorpusDataTypes.add(segdata.getClass());
62 public String
CorpusData2String(
CorpusData cd)
throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException {
63 return cd.toSaveableString();
66 public void write(
CorpusData cd, URL url)
throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException {
67 write(cd.toSaveableString(), cd.getURL());
71 public void write(String s, URL url)
throws FileNotFoundException, IOException {
73 System.out.println(
"started writing document...");
74 outappend(
"============================\n");
75 FileOutputStream fos =
new FileOutputStream(
new File(url.getFile()));
76 fos.write(s.getBytes((
"UTF-8")));
78 System.out.println(
"Document written...");
81 public void write(Document doc, URL url)
throws IOException, TransformerException, ParserConfigurationException, ParserConfigurationException, UnsupportedEncodingException, UnsupportedEncodingException, SAXException, XPathExpressionException {
82 XMLOutputter xmOut =
new XMLOutputter();
83 String unformattedCorpusData = xmOut.outputString(doc);
85 String prettyCorpusData = pp.
indent(unformattedCorpusData,
"event");
86 write(prettyCorpusData, url);
89 public void write(org.w3c.dom.Document doc, URL url)
throws IOException, TransformerException, ParserConfigurationException, ParserConfigurationException, UnsupportedEncodingException, UnsupportedEncodingException, SAXException, XPathExpressionException {
92 String prettyCorpusData = pp.
indent(unformattedCorpusData,
"event");
93 write(prettyCorpusData, url);
97 Calendar cal = Calendar.getInstance(TimeZone.getDefault());
98 java.text.SimpleDateFormat sdf =
new java.text.SimpleDateFormat(
"dd-MM-yyyy HH:mm:ss");
99 String time = sdf.format(cal.getTime());
100 out.append(
"[" + time +
"] ");
104 public void write(Collection<CorpusData> cdc, URL url) {
// Reads the file behind `url` as one of the corpus data types listed in `clcds`.
// The branch is chosen from the URL path (extension / substring) combined with
// whether the matching type's class is contained in `clcds`.
// NOTE(review): only the branch conditions are visible in this listing; most
// branch bodies and the return statements are not shown here.
125 public CorpusData readFileURL(URL url, Collection<Class<? extends CorpusData>> clcds) throws SAXException, JexmaraldaException, ClassNotFoundException {
// Only regular local files are dispatched on.
127 if (
new File(url.getFile()).isFile()) {
// NOTE(review): this is the only extension test without toLowerCase(), so
// an upper-case "EXB" suffix does not match here - confirm that is intended.
128 if (url.getPath().endsWith(
"exb") && clcds.contains(bt.getClass())) {
132 }
else if (url.getPath().toLowerCase().endsWith(
"coma") && clcds.contains(coma.getClass())) {
136 }
// NOTE(review): the path is lower-cased before contains("Annotation") is
// applied, so the capital "A" can never match and this branch is unreachable
// as written; the literal was presumably meant to be "annotation".
else if (url.getPath().toLowerCase().endsWith(
"xml") && ((url.getPath().toLowerCase().contains(
"Annotation"))) && clcds.contains(asp.getClass())) {
137 AnnotationSpecification as =
new AnnotationSpecification(url);
138 System.out.println(as.getFilename() +
" read");
140 }
// && binds tighter than ||, so this reads as:
// (ends with "xml" AND contains "cmdi" AND type requested)
// OR (ends with "cmdi" AND type requested).
else if ((url.getPath().toLowerCase().endsWith(
"xml") && url.getPath().toLowerCase().contains(
"cmdi")) && clcds.contains(cmdidata.getClass()) || url.getPath().toLowerCase().endsWith(
"cmdi") && clcds.contains(cmdidata.getClass())) {
144 }
// Any remaining "xml" path is tested against the class of `usdata`.
else if (url.getPath().toLowerCase().endsWith(
"xml") && clcds.contains(usdata.getClass())) {
148 }
else if (url.getPath().toLowerCase().endsWith(
"exs") && clcds.contains(segdata.getClass())) {
// Diagnostic printed for a URL that is not read.
153 System.out.println(url +
" will not be read");
// Diagnostic printed for a URL that cannot be read.
157 System.out.println(
"Critical: " + url +
" cannot be read");
168 public Collection<CorpusData>
read(URL url)
throws URISyntaxException, IOException, SAXException, JexmaraldaException, ClassNotFoundException {
170 for (URL readurl : alldata) {
172 if (cdread != null && !cdc.contains(cdread)) {
180 public Collection<CorpusData>
read(URL url, Collection<Class<? extends CorpusData>> chosencdc) throws URISyntaxException, IOException, SAXException, JexmaraldaException, ClassNotFoundException {
183 for (URL readurl : alldata) {
185 if (cdread != null && !cdc.contains(cdread)) {
194 System.out.println(path2resource);
195 if (xslstring == null) {
196 throw new IOException(
"Stylesheet not found!");
202 String xslstring =
new String(Files.readAllBytes(Paths.get(
new URL(path2resource).toURI())));
203 System.out.println(path2resource);
204 if (xslstring == null) {
205 throw new IOException(
"File not found!");
210 public Collection<URL>
URLtoList(URL url)
throws URISyntaxException, IOException {
216 Path path = Paths.get(url.toURI());
218 for (URL urlread : recursed) {
220 alldata.add(urlread);
241 String scheme = url.getProtocol();
242 return "file".equalsIgnoreCase(scheme) && !
hasHost(url);
248 public static boolean isDirectory(java.net.URL url) throws URISyntaxException {
250 return Files.isDirectory(Paths.get(url.toURI()));
253 public static boolean hasHost(java.net.URL url) {
254 String host = url.getHost();
255 return host != null && !
"".equals(host);
258 public void writePrettyPrinted(
CorpusData cd, URL url)
throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException {
259 write(cd.toSaveableString(), cd.getURL());
263 InputStream in = getClass().getResourceAsStream(internalPath);
264 OutputStream out =
new FileOutputStream(
new File(url.getFile()));
265 IOUtils.copy(in, out);
268 void listFiles(Path path)
throws IOException {
269 try (DirectoryStream<Path> stream = Files.newDirectoryStream(path)) {
270 for (Path entry : stream) {
271 if (Files.isDirectory(entry)) {
274 String sentry = entry.getFileName().toString().toLowerCase();
275 if (sentry.endsWith(
".exb") || sentry.endsWith(
".exs") || sentry.endsWith(
".coma") || sentry.endsWith(
".xml") || sentry.endsWith(
".cmdi") || sentry.endsWith(
".eaf") || sentry.endsWith(
".flextext") || sentry.endsWith(
".esa") || sentry.endsWith(
".tei") || sentry.endsWith(
".xsl")) {
276 recursed.add(entry.toUri().toURL());
void write(Collection< CorpusData > cdc, URL url)
void write(org.w3c.dom.Document doc, URL url)
void write(String s, URL url)
void write(Document doc, URL url)
static boolean isLocalFile(java.net.URL url)
CorpusData readFileURL(URL url, Collection< Class<?extends CorpusData >> clcds)
CorpusData readFileURL(URL url)
String indent(String xml, String suppressedElements)
Collection< URL > URLtoList(URL url)
void writePrettyPrinted(CorpusData cd, URL url)
String readExternalResourceAsString(String path2resource)
static boolean isDirectory(java.net.URL url)
String readInternalResourceAsString(String path2resource)
static String InputStream2String(InputStream is)
static String W3cDocument2String(org.w3c.dom.Document doc)
Collection< CorpusData > read(URL url)
static boolean hasHost(java.net.URL url)
void copyInternalBinaryFile(String internalPath, URL url)
String CorpusData2String(CorpusData cd)
void write(CorpusData cd, URL url)
Collection< CorpusData > read(URL url, Collection< Class<?extends CorpusData >> chosencdc)