corpus-services  1.0
CmdiChecker.java
Go to the documentation of this file.
1 
9 package de.uni_hamburg.corpora.validation;
10 
16 import java.io.IOException;
17 import java.util.Collection;
18 import javax.xml.parsers.ParserConfigurationException;
19 import org.w3c.dom.Document;
20 import org.w3c.dom.Element;
21 import org.w3c.dom.Node;
22 import org.w3c.dom.NodeList;
23 import org.xml.sax.SAXException;
25 
30 public class CmdiChecker extends Checker implements CorpusFunction {
31 
32  ValidatorSettings settings;
33 
34  public CmdiChecker() {
35  //no fix available
36  super(false);
37  }
38 
39  private boolean isUrlHandleOrHzsk(String url) {
40  if ((url.startsWith("http://hdl.handle.net/11022/")) ||
41  (url.startsWith("https://corpora.uni-hamburg.de/repository/")) ||
42  (url.startsWith("http://annis.corpora.uni-hamburg.de"))) {
43  return true;
44  } else {
45  return false;
46  }
47  }
48 
49  public Report function(CorpusData cd, Boolean fix)
50  throws SAXException, IOException, ParserConfigurationException {
51  CmdiData cmdi = (CmdiData) cd;
52  Document doc = JdomDocument2W3cDocument(cmdi.getJdom());
53  NodeList rps = doc.getElementsByTagName("ResourceProxy");
54  Report stats = new Report();
55  boolean hasLandingPage = false;
56  for (int i = 0; i < rps.getLength(); i++) {
57  Element rpe = (Element) rps.item(i);
58  NodeList restypes = rpe.getElementsByTagName("ResourceType");
59  Element restype = (Element) restypes.item(0);
60  if (restype.getTextContent().equals("LandingPage")) {
61  hasLandingPage = true;
62  stats.addCorrect(function, cd, "Good resource type LandingPage");
63  } else if (restype.getTextContent().equals("Resource")) {
64  stats.addCorrect(function, cd,
65  "Good resource type Resource");
66  } else if (restype.getTextContent().equals("SearchPage")) {
67  stats.addCorrect(function, cd,
68  "Good resource type SearchPage");
69  } else if (restype.getTextContent().equals("SearchService")) {
70  stats.addCorrect(function, cd,
71  "Good resource type SearchService");
72  } else if (restype.getTextContent().equals("Metadata")) {
73  stats.addCorrect(function, cd,
74  "Good resource type Metadata");
75  } else {
76  stats.addWarning(function, cd,
77  "Unrecognised resource type "
78  + restype.getTextContent());
79  }
80  NodeList resrefs = rpe.getElementsByTagName("ResourceRef");
81  Element resref = (Element) resrefs.item(0);
82  String url = resref.getTextContent();
83  if (!isUrlHandleOrHzsk(url)) {
84  stats.addCritical(function, cd,
85  "Invalid URL for reesource proxy:"
86  + url +
87  "URLs should start with http://hdl.handle.net... or "
88  + "https://corpora.uni-hamburg.de/repository/...");
89  } else {
90  stats.addCorrect(function, cd, "Good resource proxy URL " + url);
91  }
92  }
93  if (!hasLandingPage) {
94  stats.addCritical(function, cd, "Missing landing page");
95  } else {
96  stats.addCorrect(function, cd, "Good landing page found");
97  }
98  NodeList gis = doc.getElementsByTagName("GeneralInfo");
99  for (int i = 0; i < gis.getLength(); i++) {
100  Node ginode = gis.item(i);
101  if (ginode.getNodeType() != Node.ELEMENT_NODE) {
102  continue;
103  }
104  Element gi = (Element) ginode;
105  NodeList childs = gi.getChildNodes();
106  boolean englishTitle = false;
107  boolean englishDesc = false;
108  boolean legalOwner = false;
109  boolean pidFound = false;
110  for (int j = 0; j < childs.getLength(); j++) {
111  Node n = childs.item(j);
112  if (n.getNodeType() != Node.ELEMENT_NODE) {
113  continue;
114  }
115  Element e = (Element) n;
116  if (e.getTagName().equals("PID")) {
117  if (!isUrlHandleOrHzsk(e.getTextContent())) {
118  stats.addCritical(function, cd, "Invalid URL for PID:"
119  + e.getTextContent() +
120  "URLs should start with "
121  + "http://hdl.handle.net... or "
122  + "https://corpora.uni-hamburg.de/repository/...");
123  } else {
124  stats.addCorrect(function, cd, "Good PID URL: "
125  + e.getTextContent());
126  }
127  pidFound = true;
128  } else if (e.getTagName().equals("Description")) {
129  if (e.getAttribute("xml:lang").equals("en") || e.getAttribute("xml:lang").equals("eng")) {
130  englishDesc = true;
131  stats.addCorrect(function, cd, "English Description present");
132  }
133  } else if (e.getTagName().equals("Title")) {
134  if (e.getAttribute("xml:lang").equals("en") || e.getAttribute("xml:lang").equals("eng")) {
135  englishTitle = true;
136  stats.addCorrect(function, cd, "English title present");
137  }
138  } else if (e.getTagName().equals("LegalOwner")) {
139  legalOwner = true;
140  stats.addCorrect(function, cd, "LegalOwner present");
141  } else {
142  System.out.println("DEBUG: GeneralInfo/" + e.getTagName());
143  // pass
144  }
145  }
146  if (!englishTitle) {
147  stats.addWarning(function, cd, "English title missing from General Info "
148  + "(needed by FCS for example)");
149  }
150  if (!englishDesc) {
151  stats.addWarning(function, cd, "English Description missing from General Info "
152  + "(needed by FCS for example)");
153  }
154  if (!pidFound) {
155  stats.addCritical(function, cd, "PID missing");
156  }
157  }
158  NodeList cis = doc.getElementsByTagName("CorpusInfo");
159  for (int i = 0; i < cis.getLength(); i++) {
160  Node cinode = cis.item(i);
161  if (cinode.getNodeType() != Node.ELEMENT_NODE) {
162  continue;
163  }
164  Element ci = (Element) cis.item(i);
165  checkCorpusInfo(ci, stats, cd);
166  }
167  return stats;
168  }
169 
170  private void checkCorpusInfo(Element ci, Report stats, CorpusData cd) {
171  NodeList childs = ci.getChildNodes();
172  boolean corpusType = false;
173  boolean genre = false;
174  boolean modality = false;
175  boolean annotationTypes = false;
176  boolean timeCoverage = false;
177  for (int i = 0; i < childs.getLength(); i++) {
178  Node n = childs.item(i);
179  if (n.getNodeType() != Node.ELEMENT_NODE) {
180  continue;
181  }
182  Element e = (Element) n;
183  if (e.getTagName().equals("CorpusContext")) {
184  NodeList cts = e.getElementsByTagName("CorpusType");
185  if (cts.getLength() != 0) {
186  corpusType = true;
187  }
188  } else if (e.getTagName().equals("SubjectLanguages")) {
189  checkSubjectLanguages(e, stats, cd);
190  } else if (e.getTagName().equals("Coverage")) {
191  NodeList tcs = e.getElementsByTagName("TimeCoverage");
192  if (tcs.getLength() != 0) {
193  timeCoverage = true;
194  }
195  checkCoverage(e, stats, cd);
196  } else if (e.getTagName().equals("Content")) {
197  NodeList genres = e.getElementsByTagName("Genre");
198  if (genres.getLength() != 0) {
199  genre = true;
200  }
201  NodeList modalities = e.getElementsByTagName("Modalities");
202  if (modalities.getLength() != 0) {
203  modality = true;
204  }
205  } else {
206  //
207  System.out.println("DEBUG: CorpusInfo/" + e.getTagName());
208  }
209  }
210  if (!corpusType) {
211  stats.addCritical(function, cd, "Corpus type is needed for repo web pages");
212  } else {
213  stats.addCorrect(function, cd, "Corpus type included");
214  }
215  if (!genre) {
216  stats.addCritical(function, cd, "Genre is needed for repo web pages");
217  } else {
218  stats.addCorrect(function, cd, "Genre included");
219  }
220  if (!modality) {
221  stats.addCritical(function, cd, "Modality is needed for repo web pages");
222  } else {
223  stats.addCorrect(function, cd, "modality included");
224  }
225  if (!timeCoverage) {
226  stats.addWarning(function, cd, "time coverage is missing (recommended for VLO)");
227  }
228  }
229 
230  private void checkCoverage(Element coverage, Report stats, CorpusData cd) {
231  NodeList timeCoverages = coverage.getElementsByTagName("TimeCoverage");
232  for (int i = 0; i < timeCoverages.getLength(); i++) {
233  Node n = timeCoverages.item(i);
234  if (n.getNodeType() != Node.ELEMENT_NODE) {
235  continue;
236  }
237  Element e = (Element) n;
238  String tc = e.getTextContent();
239  if (tc.matches("[0-9]+/[0-9]+")) {
240  stats.addCorrect(function, cd, "Good time coverage");
241  } else {
242  stats.addCritical(function, cd, "TimeCoverage should be YYYY/YYYY for VLO");
243  }
244  }
245 
246  }
247 
248  private void checkSubjectLanguages(Element sls, Report stats, CorpusData cd) {
249  NodeList langs = sls.getElementsByTagName("Language");
250  for (int i = 0; i < langs.getLength(); i++) {
251  Node n = langs.item(i);
252  if (n.getNodeType() != Node.ELEMENT_NODE) {
253  continue;
254  }
255  Element e = (Element) n;
256  NodeList childs = e.getElementsByTagName("LanguageName");
257  boolean engFound = false;
258  for (int j = 0; j < childs.getLength(); j++) {
259  Element lang = (Element) childs.item(j);
260  if (lang.getAttribute("xml:lang").equals("eng")) {
261  engFound = true;
262  }
263  }
264  if (!engFound) {
265  stats.addCritical(function, cd, "Each subject language must have @xml:lang eng "
266  + "filled in");
267  } else {
268  stats.addCorrect(function, cd, "Goog language data");
269  }
270  }
271  }
272 
273 
279  @Override
280  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
281  try {
282  Class cl = Class.forName("de.uni_hamburg.corpora.CmdiData");
283  IsUsableFor.add(cl);
284  } catch (ClassNotFoundException ex) {
285  report.addException(ex, " usable class not found");
286  }
287  return IsUsableFor;
288  }
289 
293  @Override
294  public String getDescription() {
295  String description = "This class loads cmdi data and check for potential "
296  + "problems with HZSK repository depositing.";
297  return description;
298  }
299 
300  @Override
301  public Report function(Corpus c, Boolean fix) throws SAXException, IOException, ParserConfigurationException {
302  Report stats = new Report();
303  for(CmdiData cmdid : c.getCmdidata()){
304  stats.merge(function(cmdid, false));
305  }
306  return stats;
307  }
308 
309 }
Collection< Class<?extends CorpusData > > getIsUsableFor()
void merge(Report sr)
Definition: Report.java:73
void addCritical(String description)
Definition: Report.java:104
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
void addWarning(String statId, String description)
Definition: Report.java:164
void addCorrect(String statId, String description)
Definition: Report.java:217
void addException(Throwable e, String description)
Definition: Report.java:287