corpus-services  1.0
RemoveUnlinkedFiles.java
Go to the documentation of this file.
1 /*
2  * To change this license header, choose License Headers in Project Properties.
3  * To change this template file, choose Tools | Templates
4  * and open the template in the editor.
5  */
6 package de.uni_hamburg.corpora.publication;
7 
13 import java.io.File;
14 import java.io.IOException;
15 import java.util.ArrayList;
16 import java.util.Arrays;
17 import java.util.Collection;
18 import java.util.List;
19 import javax.xml.parsers.DocumentBuilder;
20 import javax.xml.parsers.DocumentBuilderFactory;
21 import javax.xml.parsers.ParserConfigurationException;
22 import javax.xml.transform.TransformerException;
23 import javax.xml.xpath.XPathExpressionException;
24 import org.w3c.dom.Document;
25 import org.w3c.dom.Element;
26 import org.w3c.dom.NodeList;
27 import org.xml.sax.SAXException;
28 
35 public class RemoveUnlinkedFiles extends Publisher implements CorpusFunction {
36 
37  List<String> fileList;
38  CorpusData comadata;
39  String baseDirectory;
40  final List<String> filenamewhitelist;
41 
43  super();
44  fileList = new ArrayList<String>();
45  //these are the files we don't want to remove even if they are not in Coma
46  filenamewhitelist = new ArrayList<String>();
47  filenamewhitelist.add("_score.html");
48  filenamewhitelist.add("_list.html");
49  }
50 
56  public Report removeFiles(CorpusData comadata) {
57 
58  Report stats = new Report();
59 
60  stats.addFix(function, comadata, "File:" + baseDirectory);
61 
62  // iterate through all files in the coma directory and its subdirectories
63  walk(baseDirectory, stats);
64  System.out.println("Done");
65 
66  return stats;
67  }
68 
76 
77  Report stats = new Report();
78 
79  try {
80 
81  //firstly add path of coma file itself because it is not linked in itself
82  fileList.add(cd.getFilename());
83 
84  DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
85  DocumentBuilder db = dbf.newDocumentBuilder();
86  Document doc = db.parse(TypeConverter.String2InputStream(cd.toSaveableString())); // get the file as a document
87 
88  //list of names of elements that provide a reference to a corpus file
89  List<String> elementNames = Arrays.asList("relPath", "NSLink");
90 
91  //iterate through elements and save file paths to fileList
92  for (int i = 0; i < elementNames.size(); i++) {
93  NodeList elements = doc.getElementsByTagName(elementNames.get(i));
94  for (int j = 0; j < elements.getLength(); j++) {
95  Element e = (Element) elements.item(j);
96  String c = e.getTextContent();
97  c = c.replace('/', File.separatorChar).replace('\\', File.separatorChar);
98  //c = c.substring(c.lastIndexOf(baseDirectory.replace('/', File.separatorChar).replace('\\', File.separatorChar)) + 1);
99  fileList.add(c);
100  }
101  }
102 
103  } catch (ParserConfigurationException ex) {
104  stats.addException(ex, function, cd, "Unknown ParserConfigurationException.");
105  } catch (TransformerException ex) {
106  stats.addException(ex, function, cd, "Unknown TransformerException.");
107  } catch (SAXException ex) {
108  stats.addException(ex, function, cd, "Unknown SAXException.");
109  } catch (IOException ex) {
110  stats.addException(ex, function, cd, "Unknown IOException.");
111  } catch (XPathExpressionException ex) {
112  stats.addException(ex, function, cd, "Unknown XPathExpressionException.");
113  }
114 
115  return stats;
116  }
117 
118  public void walk(String path, Report stats) {
119 
120  File dir = new File(path);
121  File[] foundFiles = dir.listFiles();
122 
123  for (File file : foundFiles) {
124  if (file.isDirectory() && (!file.getName().startsWith("."))) {
125  //disregard directories starting with "." and go further
126  walk(file.getAbsolutePath(), stats);
127  } else if (!file.getName().startsWith(".")) {
128 
129  // see if this file is in the file list from Coma
130  // if it is not, then remove it from disk
131  String name = file.getAbsolutePath().replace('/', File.separatorChar).replace('\\', File.separatorChar);
132 
133  //iterate through files linked in Coma and see if the current one is there
134  Boolean keepFile = false;
135  for (int i = 0; i < fileList.size(); i++) {
136  String linkedFile = fileList.get(i);
137  if (name.endsWith(linkedFile) || name.endsWith(filenamewhitelist.get(0)) || name.endsWith(filenamewhitelist.get(1))) {
138  keepFile = true;
139  }
140  }
141 
142  if (keepFile) {
143  stats.addNote(function, comadata, "Keeping: " + name + " (found in Coma and file system).");
144  } else {
145  File FileToRemove = new File(name);
146  if (FileToRemove.delete()) {
147  stats.addNote(function, comadata, "Removed: " + name + " (not found in Coma).");
148  } else {
149  stats.addWarning(function, comadata, "Removal unsuccessful: " + name + " (not found in Coma).");
150  }
151  }
152  }
153  }
154 
155  }
156 
157  @Override
158  public Report function(CorpusData cd) {
159 
160  //path of coma file
161  comadata = cd;
162  baseDirectory = comadata.getParentURL().getPath();
163 
164  Report stats = new Report();
165  stats = generateFileList(cd);
166  stats.merge(removeFiles(cd));
167  return stats;
168  }
169 
170  @Override
171  public Report function(Corpus c) {
172 
173  //path of coma file
174  comadata = c.getComaData();
175  baseDirectory = comadata.getParentURL().getPath();
176 
177  Report stats = new Report();
178  stats = generateFileList(cd);
179  stats.merge(removeFiles(cd));
180  return stats;
181  }
182 
183  @Override
184  public Collection<Class<? extends CorpusData>> getIsUsableFor() {
185  try {
186  Class cl = Class.forName("de.uni_hamburg.corpora.ComaData");
187  IsUsableFor.add(cl);
188  } catch (ClassNotFoundException ex) {
189  report.addException(ex, "Usable class not found.");
190  }
191  return IsUsableFor;
192  }
193 
194  @Override
195  public String getDescription() {
196  String description = "This class takes a coma file and removes all files from the directory/subdirectories "
197  + "which are not linked somewhere in the coma file. ";
198  return description;
199  }
200 
201 }
Collection< Class<?extends CorpusData > > getIsUsableFor()
void addNote(String statId, String description)
Definition: Report.java:245
void merge(Report sr)
Definition: Report.java:73
void addWarning(String statId, String description)
Definition: Report.java:164
static InputStream String2InputStream(String s)
void addException(Throwable e, String description)
Definition: Report.java:287
void addFix(String statId, CorpusData cd, String description)
Definition: Report.java:155