corpus-services  1.0
PrettyPrinter.java
Go to the documentation of this file.
1 package de.uni_hamburg.corpora.utilities;
2 /*
3  * To change this license header, choose License Headers in Project Properties.
4  * To change this template file, choose Tools | Templates
5  * and open the template in the editor.
6  */
7 
8 
9 import java.io.ByteArrayInputStream;
10 import java.io.IOException;
11 import java.io.UnsupportedEncodingException;
12 import java.util.regex.Pattern;
13 import javax.xml.parsers.DocumentBuilderFactory;
14 import javax.xml.parsers.ParserConfigurationException;
15 import javax.xml.transform.OutputKeys;
16 import javax.xml.transform.TransformerException;
17 import javax.xml.xpath.XPath;
18 import javax.xml.xpath.XPathConstants;
19 import javax.xml.xpath.XPathExpressionException;
20 import javax.xml.xpath.XPathFactory;
21 import org.w3c.dom.Document;
22 import org.w3c.dom.Node;
23 import org.w3c.dom.NodeList;
24 import org.xml.sax.InputSource;
25 import org.xml.sax.SAXException;
26 
31 public class PrettyPrinter {
32 
33 
34  String xslLocation = "/xsl/pretty-print-sort-elements.xsl";
35 
36 
37  public PrettyPrinter(){
38  }
39 
40 
48  public String indent(String xml, String suppressedElements) throws TransformerException, ParserConfigurationException, UnsupportedEncodingException, SAXException, IOException, XPathExpressionException {
49 
50  String xslString = TypeConverter.InputStream2String(getClass().getResourceAsStream(xslLocation));
51  return indent(xml, suppressedElements, xslString);
52 
53  }
54 
55 
63  public String indent(String xml, String suppressedElements, String xslString) throws TransformerException, ParserConfigurationException, UnsupportedEncodingException, SAXException, IOException, XPathExpressionException {
64 
65 
66  // Turn xml string into a document
67  Document document = DocumentBuilderFactory.newInstance()
68  .newDocumentBuilder()
69  .parse(new InputSource(new ByteArrayInputStream(xml.getBytes("utf-8"))));
70 
71  // Remove whitespaces outside tags
72  document.normalize();
73  XPath xPath = XPathFactory.newInstance().newXPath();
74  NodeList nodeList = (NodeList) xPath.evaluate("//text()[normalize-space()='']",
75  document,
76  XPathConstants.NODESET);
77 
78  for (int i = 0; i < nodeList.getLength(); ++i) {
79  Node node = nodeList.item(i);
80  node.getParentNode().removeChild(node);
81  }
82 
83  // Setup pretty print options
84  // get the XSLT stylesheet and the XML base
85  //String xslString = TypeConverter.InputStream2String(de.uni_hamburg.corpora.utilities.PrettyPrinter.class.getClassLoader().getResourceAsStream("/xsl/pretty-print-sort-elements.xsl"));
86 
87  String xmlString = TypeConverter.W3cDocument2String(document);
88 
89  // create XSLTransformer and set the parameters
90  XSLTransformer xt = new XSLTransformer("net.sf.saxon.TransformerFactoryImpl");
91 
92  xt.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
93  xt.setOutputProperty(OutputKeys.VERSION, "1.0");
94  xt.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, xml.indexOf("<?xml") >= 0 ? "no" : "yes");
95  xt.setOutputProperty(OutputKeys.INDENT, "yes");
96  xt.setOutputProperty("suppress-indentation", suppressedElements);
97 
98 
99  // perform XSLT transformation
100  String prettyXmlString;
101  if(xslString.equals("")){
102  prettyXmlString = xt.transform(xmlString);
103 
104  } else{
105  prettyXmlString = xt.transform(xmlString, xslString);
106  }
107 
108 
109  /* insert some specific EXMARaLDA dialect styles */
110 
111  // insert a blank space at the end of empty elements
112  Pattern r1 = Pattern.compile("<([^>]+)([^>\\s])/>", Pattern.DOTALL);
113  prettyXmlString = r1.matcher(prettyXmlString).replaceAll("<$1$2 />");
114 
115  // insert explicit CDATA section for specific elements
116  Pattern r2 = Pattern.compile("<nts([^>]*)>([\\s]+)</nts>", Pattern.DOTALL);
117  prettyXmlString = r2.matcher(prettyXmlString).replaceAll("<nts$1><![CDATA[$2]]></nts>");
118 
119  // insert explicit CDATA section for specific elements
120  Pattern r2a = Pattern.compile("<event([^>]*)>([\\s]+)</event>", Pattern.DOTALL);
121  prettyXmlString = r2a.matcher(prettyXmlString).replaceAll("<event$1><![CDATA[$2]]></event>");
122 
123  // insert explicit CDATA section for specific elements
124  Pattern r2b = Pattern.compile("<ts([^>]*)>([\\s]+)</ts>", Pattern.DOTALL);
125  prettyXmlString = r2b.matcher(prettyXmlString).replaceAll("<ts$1><![CDATA[$2]]></ts>");
126 
127  // insert explicit CDATA section for specific elements
128  Pattern r2c = Pattern.compile("<ta([^>]*)>([\\s]+)</ta>", Pattern.DOTALL);
129  prettyXmlString = r2c.matcher(prettyXmlString).replaceAll("<ta$1><![CDATA[$2]]></ta>");
130 
131  // insert explicit CDATA section for specific elements
132  Pattern r2d = Pattern.compile("<ats([^>]*)>([\\s]+)</ats>", Pattern.DOTALL);
133  prettyXmlString = r2d.matcher(prettyXmlString).replaceAll("<ats$1><![CDATA[$2]]></ats>");
134 
135  // re-sort attributes for EXBs from alphabetic to EXB style
136  Pattern r3 = Pattern.compile("<event\\s*(end=\"[^\">]*\")\\s+(start=\"[^\">]*\")\\s*>", Pattern.DOTALL);
137  prettyXmlString = r3.matcher(prettyXmlString).replaceAll("<event $2 $1>");
138 
139  Pattern r4 = Pattern.compile("<tier\\s+(category=\"[^\">]*\")\\s+(display\\-name=\"[^\">]*\")\\s+(id=\"[^\">]*\")\\s+(speaker=\"[^\">]*\")\\s+(type=\"[^\">]*\")\\s*(/?)>", Pattern.DOTALL);
140  prettyXmlString = r4.matcher(prettyXmlString).replaceAll("<tier $3 $4 $1 $5 $2 $6>");
141 
142  // return certain empty elements from EXB with opening and closing tags
143  Pattern r5 = Pattern.compile("<(tier|event|ud\\-meta\\-information|languages\\-used|ud\\-speaker\\-information)([^/>]*?)\\s*/>", Pattern.DOTALL);
144  prettyXmlString = r5.matcher(prettyXmlString).replaceAll("<$1$2></$1>");
145 
146  return prettyXmlString;
147  }
148 
149 }
String indent(String xml, String suppressedElements)
static String InputStream2String(InputStream is)
String indent(String xml, String suppressedElements, String xslString)
static String W3cDocument2String(org.w3c.dom.Document doc)
void setOutputProperty(String propertyName, String propertyValue)