9 package de.uni_hamburg.corpora.validation;
16 import java.io.IOException;
17 import java.util.Collection;
18 import javax.xml.parsers.ParserConfigurationException;
19 import org.w3c.dom.Document;
20 import org.w3c.dom.Element;
21 import org.w3c.dom.Node;
22 import org.w3c.dom.NodeList;
23 import org.xml.sax.SAXException;
/**
 * Tests whether a URL string begins with one of three accepted prefixes:
 * the hdl.handle.net handle prefix 11022, the corpora.uni-hamburg.de
 * repository path, or the annis.corpora.uni-hamburg.de host.
 *
 * NOTE(review): only the condition is visible in this excerpt; the branch
 * bodies and the return statements are missing. Presumably a match yields
 * true and anything else false - confirm against the full file.
 * NOTE(review): url is dereferenced without a null check, so a null
 * argument would throw NullPointerException.
 *
 * @param url the URL text to test against the accepted prefixes
 */
39 private boolean isUrlHandleOrHzsk(String url) {
40 if ((url.startsWith(
"http://hdl.handle.net/11022/")) ||
41 (url.startsWith(
"https://corpora.uni-hamburg.de/repository/")) ||
42 (url.startsWith(
"http://annis.corpora.uni-hamburg.de"))) {
50 throws SAXException, IOException, ParserConfigurationException {
52 Document doc = JdomDocument2W3cDocument(cmdi.
getJdom());
53 NodeList rps = doc.getElementsByTagName(
"ResourceProxy");
55 boolean hasLandingPage =
false;
56 for (
int i = 0; i < rps.getLength(); i++) {
57 Element rpe = (Element) rps.item(i);
58 NodeList restypes = rpe.getElementsByTagName(
"ResourceType");
59 Element restype = (Element) restypes.item(0);
60 if (restype.getTextContent().equals(
"LandingPage")) {
61 hasLandingPage =
true;
62 stats.
addCorrect(
function, cd,
"Good resource type LandingPage");
63 }
else if (restype.getTextContent().equals(
"Resource")) {
65 "Good resource type Resource");
66 }
else if (restype.getTextContent().equals(
"SearchPage")) {
68 "Good resource type SearchPage");
69 }
else if (restype.getTextContent().equals(
"SearchService")) {
71 "Good resource type SearchService");
72 }
else if (restype.getTextContent().equals(
"Metadata")) {
74 "Good resource type Metadata");
77 "Unrecognised resource type " 78 + restype.getTextContent());
80 NodeList resrefs = rpe.getElementsByTagName(
"ResourceRef");
81 Element resref = (Element) resrefs.item(0);
82 String url = resref.getTextContent();
83 if (!isUrlHandleOrHzsk(url)) {
85 "Invalid URL for reesource proxy:" 87 "URLs should start with http://hdl.handle.net... or " 88 +
"https://corpora.uni-hamburg.de/repository/...");
90 stats.
addCorrect(
function, cd,
"Good resource proxy URL " + url);
93 if (!hasLandingPage) {
94 stats.
addCritical(
function, cd,
"Missing landing page");
96 stats.
addCorrect(
function, cd,
"Good landing page found");
98 NodeList gis = doc.getElementsByTagName(
"GeneralInfo");
99 for (
int i = 0; i < gis.getLength(); i++) {
100 Node ginode = gis.item(i);
101 if (ginode.getNodeType() != Node.ELEMENT_NODE) {
104 Element gi = (Element) ginode;
105 NodeList childs = gi.getChildNodes();
106 boolean englishTitle =
false;
107 boolean englishDesc =
false;
108 boolean legalOwner =
false;
109 boolean pidFound =
false;
110 for (
int j = 0; j < childs.getLength(); j++) {
111 Node n = childs.item(j);
112 if (n.getNodeType() != Node.ELEMENT_NODE) {
115 Element e = (Element) n;
116 if (e.getTagName().equals(
"PID")) {
117 if (!isUrlHandleOrHzsk(e.getTextContent())) {
118 stats.
addCritical(
function, cd,
"Invalid URL for PID:" 119 + e.getTextContent() +
120 "URLs should start with " 121 +
"http://hdl.handle.net... or " 122 +
"https://corpora.uni-hamburg.de/repository/...");
124 stats.
addCorrect(
function, cd,
"Good PID URL: " 125 + e.getTextContent());
128 }
else if (e.getTagName().equals(
"Description")) {
129 if (e.getAttribute(
"xml:lang").equals(
"en") || e.getAttribute(
"xml:lang").equals(
"eng")) {
131 stats.
addCorrect(
function, cd,
"English Description present");
133 }
else if (e.getTagName().equals(
"Title")) {
134 if (e.getAttribute(
"xml:lang").equals(
"en") || e.getAttribute(
"xml:lang").equals(
"eng")) {
136 stats.
addCorrect(
function, cd,
"English title present");
138 }
else if (e.getTagName().equals(
"LegalOwner")) {
140 stats.
addCorrect(
function, cd,
"LegalOwner present");
142 System.out.println(
"DEBUG: GeneralInfo/" + e.getTagName());
147 stats.
addWarning(
function, cd,
"English title missing from General Info " 148 +
"(needed by FCS for example)");
151 stats.
addWarning(
function, cd,
"English Description missing from General Info " 152 +
"(needed by FCS for example)");
158 NodeList cis = doc.getElementsByTagName(
"CorpusInfo");
159 for (
int i = 0; i < cis.getLength(); i++) {
160 Node cinode = cis.item(i);
161 if (cinode.getNodeType() != Node.ELEMENT_NODE) {
164 Element ci = (Element) cis.item(i);
165 checkCorpusInfo(ci, stats, cd);
171 NodeList childs = ci.getChildNodes();
172 boolean corpusType =
false;
173 boolean genre =
false;
174 boolean modality =
false;
175 boolean annotationTypes =
false;
176 boolean timeCoverage =
false;
177 for (
int i = 0; i < childs.getLength(); i++) {
178 Node n = childs.item(i);
179 if (n.getNodeType() != Node.ELEMENT_NODE) {
182 Element e = (Element) n;
183 if (e.getTagName().equals(
"CorpusContext")) {
184 NodeList cts = e.getElementsByTagName(
"CorpusType");
185 if (cts.getLength() != 0) {
188 }
else if (e.getTagName().equals(
"SubjectLanguages")) {
189 checkSubjectLanguages(e, stats, cd);
190 }
else if (e.getTagName().equals(
"Coverage")) {
191 NodeList tcs = e.getElementsByTagName(
"TimeCoverage");
192 if (tcs.getLength() != 0) {
195 checkCoverage(e, stats, cd);
196 }
else if (e.getTagName().equals(
"Content")) {
197 NodeList genres = e.getElementsByTagName(
"Genre");
198 if (genres.getLength() != 0) {
201 NodeList modalities = e.getElementsByTagName(
"Modalities");
202 if (modalities.getLength() != 0) {
207 System.out.println(
"DEBUG: CorpusInfo/" + e.getTagName());
211 stats.
addCritical(
function, cd,
"Corpus type is needed for repo web pages");
213 stats.
addCorrect(
function, cd,
"Corpus type included");
216 stats.
addCritical(
function, cd,
"Genre is needed for repo web pages");
218 stats.
addCorrect(
function, cd,
"Genre included");
221 stats.
addCritical(
function, cd,
"Modality is needed for repo web pages");
223 stats.
addCorrect(
function, cd,
"modality included");
226 stats.
addWarning(
function, cd,
"time coverage is missing (recommended for VLO)");
231 NodeList timeCoverages = coverage.getElementsByTagName(
"TimeCoverage");
232 for (
int i = 0; i < timeCoverages.getLength(); i++) {
233 Node n = timeCoverages.item(i);
234 if (n.getNodeType() != Node.ELEMENT_NODE) {
237 Element e = (Element) n;
238 String tc = e.getTextContent();
239 if (tc.matches(
"[0-9]+/[0-9]+")) {
240 stats.
addCorrect(
function, cd,
"Good time coverage");
242 stats.
addCritical(
function, cd,
"TimeCoverage should be YYYY/YYYY for VLO");
/**
 * Inspects every "Language" descendant of the given element and looks
 * through its "LanguageName" descendants for one whose xml:lang attribute
 * equals "eng". Results are recorded on the report as critical or correct
 * entries.
 *
 * NOTE(review): several lines of this method are missing from the excerpt
 * (the bodies of the two if statements and the test that chooses between
 * addCritical and addCorrect). Presumably engFound is set inside the
 * attribute check and decides which entry is added - confirm.
 * NOTE(review): only "eng" is accepted here, while the GeneralInfo checks
 * elsewhere in this file accept both "en" and "eng" - confirm whether the
 * difference is intended.
 *
 * @param sls   element whose "Language" descendants are examined
 * @param stats report that receives the critical/correct entries
 * @param cd    corpus data passed through to the report calls
 */
248 private void checkSubjectLanguages(Element sls,
Report stats,
CorpusData cd) {
249 NodeList langs = sls.getElementsByTagName(
"Language");
250 for (
int i = 0; i < langs.getLength(); i++) {
251 Node n = langs.item(i);
// Guard before the cast to Element below; the body of this branch is not
// visible here (presumably it skips the node - confirm).
252 if (n.getNodeType() != Node.ELEMENT_NODE) {
255 Element e = (Element) n;
256 NodeList childs = e.getElementsByTagName(
"LanguageName");
// Tracks whether this Language has an English-tagged name.
257 boolean engFound =
false;
258 for (
int j = 0; j < childs.getLength(); j++) {
259 Element lang = (Element) childs.item(j);
260 if (lang.getAttribute(
"xml:lang").equals(
"eng")) {
// NOTE(review): the end of the addCritical message and the start of the
// addCorrect branch are fused on one line in this excerpt.
265 stats.
addCritical(
function, cd,
"Each subject language must have @xml:lang eng " 268 stats.
addCorrect(
function, cd,
// NOTE(review): "Goog" looks like a typo for "Good" in this message.
"Goog language data");
282 Class cl = Class.forName(
"de.uni_hamburg.corpora.CmdiData");
284 }
catch (ClassNotFoundException ex) {
295 String description =
"This class loads cmdi data and check for potential " 296 +
"problems with HZSK repository depositing.";
/**
 * Corpus-level entry point: iterates over every CmdiData returned by
 * c.getCmdidata(), runs the per-document function on it and merges each
 * result into stats.
 *
 * NOTE(review): the fix argument is not forwarded; the per-document call
 * always receives the literal false. Confirm whether that is intentional.
 * NOTE(review): the declaration of stats, the return statement and the
 * closing braces are not visible in this excerpt.
 *
 * @param c   corpus whose CMDI documents are checked
 * @param fix not read in the visible part of this method
 * @throws SAXException declared in the signature
 * @throws IOException declared in the signature
 * @throws ParserConfigurationException declared in the signature
 */
301 public Report function(
Corpus c, Boolean fix)
throws SAXException, IOException, ParserConfigurationException {
303 for(
CmdiData cmdid : c.getCmdidata()){
304 stats.
merge(
function(cmdid,
false));
Collection< Class<?extends CorpusData > > getIsUsableFor()
void addCritical(String description)
static org.w3c.dom.Document JdomDocument2W3cDocument(org.jdom.Document jdomDoc)
void addWarning(String statId, String description)
void addCorrect(String statId, String description)
void addException(Throwable e, String description)