corpus-services  1.0
PrettyPrintData.java
Go to the documentation of this file.
1 /*
2  * To change this license header, choose License Headers in Project Properties.
3  * To change this template file, choose Tools | Templates
4  * and open the template in the editor.
5  */
6 package de.uni_hamburg.corpora.validation;
7 
15 import java.io.ByteArrayInputStream;
16 import java.io.IOException;
17 import java.io.StringWriter;
18 import java.io.UnsupportedEncodingException;
19 import java.net.URISyntaxException;
20 import java.util.Collection;
21 import javax.xml.parsers.DocumentBuilderFactory;
22 import javax.xml.parsers.ParserConfigurationException;
23 import javax.xml.transform.OutputKeys;
24 import javax.xml.transform.Transformer;
25 import javax.xml.transform.TransformerException;
26 import javax.xml.transform.TransformerFactory;
27 import javax.xml.transform.dom.DOMSource;
28 import javax.xml.transform.stream.StreamResult;
29 import javax.xml.xpath.XPath;
30 import javax.xml.xpath.XPathConstants;
31 import javax.xml.xpath.XPathExpressionException;
32 import javax.xml.xpath.XPathFactory;
33 import org.exmaralda.partitureditor.jexmaralda.JexmaraldaException;
34 import org.jdom.JDOMException;
35 import org.w3c.dom.Document;
36 import org.w3c.dom.DocumentType;
37 import org.w3c.dom.Node;
38 import org.w3c.dom.NodeList;
39 import org.xml.sax.InputSource;
40 import org.xml.sax.SAXException;
41 
49 public class PrettyPrintData extends Checker implements CorpusFunction {
50 
51  String prettyCorpusData = "";
52 
53  public PrettyPrintData() {
54  //fixing is possible
55  super(true);
56  }
57 
58  public Report function(CorpusData cd, Boolean fix) throws IOException, TransformerException, ParserConfigurationException, SAXException, XPathExpressionException {
59  // if no diff - all fine, nothing needs to be done
60  if (CorpusDataIsAlreadyPretty(cd)) {
61  report.addCorrect(function, cd, "Already pretty printed.");
62  } // if difference then - needs to be pretty printed
63  else if (fix) {
64  if (cd.toUnformattedString() == null) {
65  report.addCritical(function, cd, "Could not create the unformatted String!");
66  } else {
67  //save it instead of the old file
68  CorpusIO cio = new CorpusIO();
69  cio.write(prettyCorpusData, cd.getURL());
70  cd.updateUnformattedString(prettyCorpusData);
71  report.addFix(function, cd, "CorpusData was pretty printed and saved.");
72 
73  }
74  } else {
75  report.addCritical(function, cd, "Needs to be pretty printed.");
76  }
77 
78  return report;
79  }
80 
81  @Override
82  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
83  try {
84  Class cl = Class.forName("de.uni_hamburg.corpora.BasicTranscriptionData");
85  IsUsableFor.add(cl);
86  Class cl2 = Class.forName("de.uni_hamburg.corpora.UnspecifiedXMLData");
87  IsUsableFor.add(cl2);
88  Class cl3 = Class.forName("de.uni_hamburg.corpora.ComaData");
89  IsUsableFor.add(cl3);
90  Class cl4 = Class.forName("de.uni_hamburg.corpora.SegmentedTranscriptionData");
91  IsUsableFor.add(cl4);
92  } catch (ClassNotFoundException ex) {
93  report.addException(ex, "Usable class not found.");
94  }
95  return IsUsableFor;
96  }
97 
98  public boolean CorpusDataIsAlreadyPretty(CorpusData cd) throws TransformerException, ParserConfigurationException, SAXException, IOException, XPathExpressionException, UnsupportedEncodingException {
99  //take the data, change datatosaveable string, method indent() in utilities\PrettyPrinter.java
100  //this one works for BasicTranscriptions only (keeping events togehter), but doesn't harm others
101  //need to have another string not intended depending on which
102  //file is the input
103 
104  if (cd.toUnformattedString() != null) {
105  if (cd instanceof UnspecifiedXMLData) {
106  prettyCorpusData = toPrettyString(cd.toUnformattedString(), 2);
107  } else {
108  PrettyPrinter pp = new PrettyPrinter();
109  prettyCorpusData = pp.indent(cd.toUnformattedString(), "event");
110  }
111  return cd.toUnformattedString().equals(prettyCorpusData);
112  } else {
113  return false;
114  }
115  //compare the files
116  // if no diff - all fine, nothing needs to be done
117  //TODO error - to saveableString already pretty printed - need to change that
118 
119  }
120 
125  @Override
126  public String getDescription() {
127  String description = "This class takes XML corpusdata and formats it in the same way to avoid merge conflicts. ";
128  return description;
129  }
130 
131  // corpied from https://stackoverflow.com/questions/25864316/pretty-print-xml-in-java-8/33541820#33541820
132  public static String toPrettyString(String xml, int indent) {
133  try {
134  // Turn xml string into a document
135  Document document = DocumentBuilderFactory.newInstance()
136  .newDocumentBuilder()
137  .parse(new InputSource(new ByteArrayInputStream(xml.getBytes("utf-8"))));
138 
139  // Remove whitespaces outside tags
140  document.normalize();
141  XPath xPath = XPathFactory.newInstance().newXPath();
142  NodeList nodeList = (NodeList) xPath.evaluate("//text()[normalize-space()='']",
143  document,
144  XPathConstants.NODESET);
145 
146  for (int i = 0; i < nodeList.getLength(); ++i) {
147  Node node = nodeList.item(i);
148  node.getParentNode().removeChild(node);
149  }
150 
151  // Setup pretty print options
152  TransformerFactory transformerFactory = TransformerFactory.newInstance();
153  //transformerFactory.setAttribute("indent-number", indent);
154  Transformer transformer = transformerFactory.newTransformer();
155  transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
156  transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no");
157  transformer.setOutputProperty(OutputKeys.INDENT, "yes");
158  //keep the doctype info
159  // http://www.srccodes.com/p/article/13/how-to-retain-doctype-declaration-while-saving-dom-document-to-an-xml-file
160  DocumentType doctype = document.getDoctype();
161  System.out.println(doctype);
162  if (doctype != null && doctype.getSystemId() != null) {
163  transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, doctype.getSystemId());
164  }
165  if (doctype != null && doctype.getPublicId() != null) {
166  transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, doctype.getPublicId());
167  }
168  // Return pretty print xml string
169  StringWriter stringWriter = new StringWriter();
170  transformer.transform(new DOMSource(document), new StreamResult(stringWriter));
171  return stringWriter.toString();
172  } catch (Exception e) {
173  throw new RuntimeException(e);
174  }
175  }
176 
177  @Override
178  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException, URISyntaxException, JDOMException, TransformerException, XPathExpressionException, JexmaraldaException {
179  Report stats = new Report();
180  for (CorpusData cdata : c.getCorpusData()) {
181  stats.merge(function(cdata, fix));
182  }
183  return stats;
184  }
185 }
void merge(Report sr)
Definition: Report.java:73
String indent(String xml, String suppressedElements)
void addCritical(String description)
Definition: Report.java:104
Collection< Class<?extends CorpusData > > getIsUsableFor()
static String toPrettyString(String xml, int indent)
void addCorrect(String statId, String description)
Definition: Report.java:217
void addException(Throwable e, String description)
Definition: Report.java:287
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:66
void addFix(String statId, CorpusData cd, String description)
Definition: Report.java:155