hzsk-corpus-services  1.0
HandlePidRegistrationPublication.java
Go to the documentation of this file.
1 /*
2  * To change this license header, choose License Headers in Project Properties.
3  * To change this template file, choose Tools | Templates
4  * and open the template in the editor.
5  */
6 package de.uni_hamburg.corpora.publication;
7 
8 import com.sun.org.apache.xerces.internal.impl.dv.util.Base64;
15 import java.io.BufferedReader;
16 import java.io.FileNotFoundException;
17 import java.io.IOException;
18 import java.io.InputStream;
19 import java.io.InputStreamReader;
20 import java.io.OutputStreamWriter;
21 import java.net.HttpURLConnection;
22 import java.net.URL;
23 import java.text.DateFormat;
24 import java.text.SimpleDateFormat;
25 import java.util.Calendar;
26 import java.util.Collection;
27 import java.util.Date;
28 import javax.xml.parsers.ParserConfigurationException;
29 import javax.xml.transform.TransformerException;
30 import org.w3c.dom.Document;
31 import org.w3c.dom.Element;
32 import org.w3c.dom.Node;
33 import org.w3c.dom.NodeList;
34 import org.xml.sax.SAXException;
37 import java.io.UnsupportedEncodingException;
38 import javax.xml.xpath.XPathExpressionException;
39 
47 
 // Not referenced anywhere in the visible part of this file.
48  String cmdiLoc = "";
49 
 // Credentials sent as HTTP Basic authentication by getPID() and registerPID();
 // set them via setUser() and setPass() before calling either method.
50  String EpicApiUser = ""; //e.g. 1008-01 for HZSK
51  String EpicApiPass = ""; //e.g. K******* for HZSK
 // Handle prefix appended to both HandleEndpoint and HandleUrlBase; changeable via setHandlePrefix().
52  String HandlePrefix = "11022"; // the default is the HZSK/CLARIN prefix 11022
 // Base URL of the PID service that getPID() queries and registerPID() posts to.
53  String HandleEndpoint = "http://pid.gwdg.de/handles/";
 // Base URL used to build the Handle URLs that are written into the CMDI document.
54  String HandleUrlBase = "http://hdl.handle.net/";
55 
56  // names of XML elements in which URLs are found for which Handles shall be retrieved/registered
57  String[] ElementNames = {"MdSelfLink", "ResourceRef", "IsPartOf", "PID"};
58 
59 
60 
62  super();
63  }
64 
65  @Override
66  public Report function(CorpusData cd)
67  throws SAXException, IOException, ParserConfigurationException {
68 
69  Report stats = new Report();
70 
71  CmdiData cmdi = (CmdiData) cd;
72  Document doc = JdomDocument2W3cDocument(cmdi.getJdom());
73 
74  //optional, but recommended
75  //read this - http://stackoverflow.com/questions/13786607/normalization-in-dom-parsing-with-java-how-does-it-work
76  doc.getDocumentElement().normalize();
77 
78  Element root = doc.getDocumentElement();
79 
80  for (int x = 0; x < ElementNames.length; x++) {
81  NodeList nodes = root.getElementsByTagName(ElementNames[x]);
82  for (int i = 0; i < nodes.getLength(); i++) {
83  Node node = nodes.item(i).getFirstChild();
84  if (node == null) {
85  continue;
86  }
87  String oldURL = node.getTextContent();
88 
89  // test if the URL is already a Handle
90  if(oldURL.matches("^\\s*(https?://)?hdl\\.handle\\.net/.*$")){
91  stats.addWarning(function, cd, "URL is already a Handle PID: " + oldURL);
92  }
93  // if URL is not already a Handle
94  else{
95 
96  String newURL = oldURL;
97  String partIdentifier = "";
98  if(oldURL.matches("^.+/[A-Z0-9]{2,6}$") && !oldURL.endsWith("/CMDI")){
99  int endIndex = oldURL.lastIndexOf("/");
100  if(endIndex != -1){
101  partIdentifier = "@"+oldURL.substring(endIndex + 1);
102  oldURL = oldURL.substring(0, endIndex);
103  }
104  }
105 
106  /* get existing PID for this url */
107  String existingHandle = getPID(oldURL);
108 
109  /* there is a handle pid registered for this url already*/
110  if(existingHandle != null && !existingHandle.equals("")){
111  newURL = HandleUrlBase + HandlePrefix + "/" + existingHandle + partIdentifier;
112  newURL = newURL.replaceAll("[\\s\\n]+", "");
113  stats.addNote(function, cd, "Retrieved existing Handle PID for " + oldURL + ":\n" + newURL);
114  System.out.println("Retrieved existing Handle PID for " + oldURL + ":\n" + newURL);
115  }
116  /* for this url a new pid has to be registered */
117  else{
118  String newHandle = registerPID(oldURL);
119  newURL = HandleUrlBase + HandlePrefix + "/" + newHandle + partIdentifier;
120  newURL = newURL.replaceAll("[\\s\\n]+", "");
121  stats.addNote(function, cd, "Registered new Handle PID for " + oldURL + ":\n" + newURL);
122  System.out.println("Registered new Handle PID for " + oldURL + ":\n" + newURL);
123  }
124 
125  node.setNodeValue(newURL);
126  }
127  }
128  }
129 
130 
131  String newCmdiXmlInStringOneLine = TypeConverter.W3cDocument2String(doc);
132 
133  //make sure XML prolog is on own line (needed for jOAI provider)
134  //insert comment about automatic processing into file (between last processing instruction and directly following comment)
135  DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
136  Date today = Calendar.getInstance().getTime();
137  String newCmdiXmlInString = newCmdiXmlInStringOneLine.replaceAll("\\?>\\s*<!\\-\\-", "?>\n<!--Handle PIDs generated by HandlePIDs.java on "+df.format(today)+"-->\n<!--");
138 
139 
140  try {
141  CorpusIO cio = new CorpusIO();
142  cd.updateUnformattedString(newCmdiXmlInString);
143  cio.write(cd, cd.getURL());
144  stats.addFix(function, cd, "Handle PIDs were retrieved and file was updated.");
145  } catch (UnsupportedEncodingException ex) {
146  stats.addCritical(function, cd, "UnsupportedEncodingException: " + ex);
147  } catch (XPathExpressionException ex) {
148  stats.addCritical(function, cd, "XPathExpressionException: " + ex);
149  } catch (TransformerException ex) {
150  stats.addCritical(function, cd, "TransformerException: " + ex);
151  }
152 
153  return stats;
154  }
155 
156 
157  public String getPID(String handleURL)
158  throws IOException{
159 
160  //http://pid.gwdg.de/handles/11022?URL=http://www.corpora.uni-hamburg.de/repository
161 
162  String authString = EpicApiUser + ":" + EpicApiPass;
163  String authStringEnc = Base64.encode(authString.getBytes("UTF-8"));
164 
165  URL url = new URL(HandleEndpoint + HandlePrefix + "?URL=" + handleURL);
166  HttpURLConnection urlConnection = (HttpURLConnection) url.openConnection();
167  urlConnection.setRequestProperty("Authorization", "Basic " + authStringEnc);
168  urlConnection.setRequestProperty("Accept", "text/plain");
169  try {
170  int rc = urlConnection.getResponseCode();
171  InputStream is = urlConnection.getInputStream();
172  InputStreamReader isr = new InputStreamReader(is);
173 
174  int numCharsRead;
175  char[] charArray = new char[1024];
176  StringBuffer sb = new StringBuffer();
177  while ((numCharsRead = isr.read(charArray)) > 0) {
178  sb.append(charArray, 0, numCharsRead);
179  }
180 
181  return sb.toString();
182  } catch(FileNotFoundException fnfe) {
183  System.out.println("FileNotFound? " + fnfe.toString());
184  return null;
185  }
186  }
187 
188  public String registerPID(String handleURL)
189  throws ParserConfigurationException, SAXException, IOException{
190 
191  // "Accept:application/json" -H "Content-Type:application/json" -X POST --data '[{"type":"URL","parsed_data":"http://www.example.com/cmdi"}]' "http://pid.gwdg.de/handles/11022/"
192 
193  //http://pid.gwdg.de/handles/11022?URL=http://www.corpora.uni-hamburg.de/repository
194 
195  String authString = EpicApiUser + ":" + EpicApiPass;
196  String authStringEnc = Base64.encode(authString.getBytes("UTF-8"));
197 
198  URL object=new URL(HandleEndpoint + HandlePrefix + "/");
199  HttpURLConnection con = (HttpURLConnection) object.openConnection();
200  con.setRequestProperty("Authorization", "Basic " + authStringEnc);
201  con.setDoOutput(true);
202  con.setDoInput(true);
203  con.setRequestProperty("Content-Type", "application/json");
204  con.setRequestProperty("Accept", "application/xml");
205  con.setRequestMethod("POST");
206 
207  OutputStreamWriter wr= new OutputStreamWriter(con.getOutputStream());
208  wr.write("[{\"type\":\"URL\",\"parsed_data\":\""+handleURL+"\"}]");
209  wr.flush();
210 
211  //display what returns the POST request
212  StringBuilder sb = new StringBuilder();
213  BufferedReader br = new BufferedReader(new InputStreamReader(con.getInputStream(),"utf-8"));
214  String line = null;
215  while ((line = br.readLine()) != null) {
216  sb.append(line + "\n");
217  }
218  br.close();
219 
220  String responseInString = ""+sb.toString();
221  System.out.println(responseInString);
222  Document responseAsXml = TypeConverter.String2W3cDocument(responseInString);
223  Element root = responseAsXml.getDocumentElement();
224  NodeList nList = root.getElementsByTagName("dd");
225  String handlePID = nList.item(0).getFirstChild().getTextContent();
226  handlePID = handlePID.replaceAll("[\\s\\n]+", "");
227 
228  return handlePID;
229  }
230 
231  public void setUser(String user){
232  EpicApiUser = user;
233  }
234 
235  public void setPass(String pass){
236  EpicApiPass = pass;
237  }
238 
239  public void setHandlePrefix(String prefix){
240  HandlePrefix = prefix;
241  }
242 
243 
249  @Override
250  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
251  try {
252  Class cl = Class.forName("de.uni_hamburg.corpora.CmdiData");
253  IsUsableFor.add(cl);
254  } catch (ClassNotFoundException ex) {
255  report.addException(ex, " usable class not found");
256  }
257  return IsUsableFor;
258  }
259 
263  @Override
264  public String getDescription() {
265  String description = "This class loads CMDI data and retrieves already existing or newly registered "
266  + "Handle PIDs for URLs from specific XML elements.";
267  return description;
268  }
269 
270  @Override
271  public Report function(Corpus c) throws SAXException, IOException, ParserConfigurationException {
272  Report stats = new Report();
273  for(CmdiData cmdid : c.getCmdidata()){
274  stats.merge(function(cmdid));
275  }
276  return stats;
277  }
278 
279 }
void addNote(String statId, String description)
Definition: Report.java:245
void merge(Report sr)
Definition: Report.java:73
static org.w3c.dom.Document String2W3cDocument(String stringRespresentingDocument)
void addCritical(String description)
Definition: Report.java:104
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
void addWarning(String statId, String description)
Definition: Report.java:164
static String W3cDocument2String(org.w3c.dom.Document doc)
void addException(Throwable e, String description)
Definition: Report.java:287
void updateUnformattedString(String newUnformattedString)
void write(CorpusData cd, URL url)
Definition: CorpusIO.java:63
void addFix(String statId, CorpusData cd, String description)
Definition: Report.java:155