Attachment ''
Download 1 package dk.statsbiblioteket.doms.disseminator;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import fedora.client.FedoraClient;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import sun.misc.BASE64Encoder;

29 /**
30 * Utility class for creating a digital object bundle
31 */
32 public class DigitalObjectBundleCreator {
33 private static final Log LOG
34 = LogFactory.getLog(DigitalObjectBundleCreator.class);
35 private static final int BUFFER_BLOCKSIZE = 8192;
37 /**
38 * Package up a number of Fedora digital objects in a digital object bundle.
39 * Will simply list one FoxML object after the other. All relations with
40 * given predicates will be followed. Datastreams in a given list will be
41 * inlined as inline XML in a CDATA BASE64-encoded section.
42 *
43 * @param startingURLs List of object URLs. The URLs may contain a
44 * datastream name as well, it will be stripped off. URLs not recognised as
45 * URLs to a fedora object will be ignored.
46 * @param datastreamsToInclude List of IDs of datastreams to be inlined
47 * @param relsToFollow
48 * @return A document with the bundle
49 *
50 * @throws Error on exceptions that should never happen, like unable to
51 * initialise default document builder.
52 */
53 public static Document createBundle(Set<String> startingURLs,
54 Set<String> datastreamsToInclude,
55 Set<String> relsToFollow)
56 throws Error {
57 //Initialise result document
58 DocumentBuilder documentBuilder = getDocumentBuilder();
59 Document result = documentBuilder.newDocument();
60 Element topLevelElm = result.createElementNS(
61 "",
62 "d:digitalObjectBundle");
63 result.appendChild(topLevelElm);
64 //TODO: For completeness - handle DOMExceptions as errors?
66 List<String> urlList = new ArrayList<String>(startingURLs);
67 for (int index = 0; index < urlList.size(); index++) {
68 String url = urlList.get(index);
69 System.out.println("URL: " + url);
70 //Strip the server and PID from the URL
71 Matcher matcher = Pattern.compile("(.*)/get/([^/]+)(/.*)?").matcher(
72 url);
73 if (!matcher.find()) {
74 LOG.warn("The url '" + url
75 + "' does not seem to be a Fedora URL. Ignoring");
76 //just ignore this URL
77 continue;
78 }
79 String server =;
80 String pid =;
82 //Fetch and parse the contents from Fedora
83 String fedoraUsername = "fedora";
84 String fedoraPassword = "fedora";
85 //TODO: Real user/pass from properties
86 Document fedoraObject;
87 try {
88 fedoraObject = readFoxmlDocument(server, fedoraUsername,
89 fedoraPassword,
90 pid, documentBuilder);
91 } catch (Exception e) {
92 LOG.error("Unable to get PID '" + pid + "' from server '"
93 + server + "'. Ignoring the URL '" + url + "'", e);
94 //unable to connect to fedora in that URL. Just ignore it.
95 continue;
96 }
98 //For all datastreams do
99 NodeList datastreamNodes = fedoraObject.getElementsByTagNameNS(
100 "info:fedora/fedora-system:def/foxml#", "datastream");
101 if (datastreamNodes != null) {
102 for (int i = 0; i < datastreamNodes.getLength(); i++) {
103 Node datastreamNode = datastreamNodes.item(i);
104 checkDatastream(datastreamNode, datastreamsToInclude,
105 server, pid, fedoraObject);
107 }
108 }
110 //Drop contents in result
111 Node importedD
112 = result.importNode(fedoraObject.getFirstChild(), true);
113 topLevelElm.appendChild(importedD);
115 //TODO: Currently follows relations from all datastreams
116 NodeList rdfNodes = fedoraObject.getElementsByTagNameNS(
117 "",
118 "Description");
120 //Run through all relations
121 for (int i = 0; i < rdfNodes.getLength(); i++) {
122 Node descriptionNode = rdfNodes.item(i);
123 //TODO: Check about-attribute
124 NodeList children = descriptionNode.getChildNodes();
125 for (int c = 0; c < children.getLength(); c++) {
126 Node child = children.item(c);
127 System.out.println("Child:" + child.getNodeName());
128 //for children that we should follow
129 if (child.getNodeType() == Node.ELEMENT_NODE
130 && relsToFollow.contains(child.getNodeName())) {
131 //find the endpoint
132 NamedNodeMap attributes = child.getAttributes();
133 if (attributes != null) {
134 System.out.println("Attrbute length: " + attributes.getLength());
135 Node resourceAttribute
136 = attributes.getNamedItem("rdf:resource");
137 String relEndPoint
138 = resourceAttribute.getNodeValue();
139 //add url to list of starting points
140 String newUrl;
141 if (relEndPoint.startsWith("info:fedora/")) {
142 newUrl = server + "/get/"
143 + relEndPoint.substring(12)
144 + "/DC";
145 } else {
146 newUrl = relEndPoint;
147 }
148"Adding '" + newUrl
149 + "' to Bundle");
150 System.out.println("Adding '" + newUrl
151 + "' to Bundle");
152 urlList.add(newUrl);
153 }
154 }
156 }
158 }
160 }
162 return result;
163 }
165 private static void checkDatastream(Node datastreamNode,
166 Set<String> datastreamsToInclude,
167 String server, String pid,
168 Document fedoraObject) {
170 //Retain only newest version
171 Node datastreamVersionNode = retainNewestVersion(
172 datastreamNode);
173 //If name in datastreamsToInclude,
174 String datastreamID = getDatastreamID(datastreamNode);
175 if (datastreamID == null) {
176 LOG.error("Malformed datastream ID. Ignoring");
177 }
178 if (datastreamsToInclude.contains(datastreamID)) {
179 Node controlGroupAttribute
180 = datastreamNode.getAttributes().getNamedItem(
182 //TODO: Check for nulls
183 if (controlGroupAttribute.getNodeValue().equals("R")
184 || controlGroupAttribute.getNodeValue().equals("E")
185 || controlGroupAttribute.getNodeValue().equals("M")) {
187 inlineDatastream(controlGroupAttribute, server,
188 pid,
189 datastreamID, fedoraObject,
190 datastreamVersionNode);
191 }
192 }
193 }
195 private static void inlineDatastream(
196 Node controlGroupAttribute,
197 String server, String pid,
198 String datastreamID,
199 Document fedoraObject,
200 Node datastreamVersionNode) {
201 byte[] bytes = new byte[0];
202 try {
203 bytes = readContentData(controlGroupAttribute, server, pid,
204 datastreamID,
205 datastreamVersionNode);
206 } catch (IOException e) {
207 LOG.error("Unable to inline datastream '" + datastreamID + "' from"
208 + " object '" + pid + "'. Ignoring.", e);
209 return;
210 }
212 //Replace datastream with xmlContents datastream
213 while (datastreamVersionNode.getFirstChild()
214 != null) {
215 datastreamVersionNode.removeChild(
216 datastreamVersionNode.getFirstChild());
217 }
218 controlGroupAttribute.setNodeValue("X");
219 Node xmlContent = fedoraObject.createElementNS(
220 "info:fedora/fedora-system:def/foxml#",
221 "foxml:xmlContent");
222 datastreamVersionNode.appendChild(xmlContent);
224 //Insert CDATA element with contents
225 Node content = fedoraObject.createElementNS(
226 "",
227 "d:content");
228 xmlContent.appendChild(content);
229 Node cdata = fedoraObject.createCDATASection(
230 new BASE64Encoder().encode(bytes));
231 //TODO: BASE64Encoder is not a standard class!!
232 content.appendChild(cdata);
233 }
235 private static byte[] readContentData(Node controlGroupAttribute,
236 String server, String pid,
237 String datastreamID,
238 Node datastreamVersionNode)
239 throws IOException {
240 //url of the contents to inline
241 URL contentURL = generateContentURL(controlGroupAttribute, server, pid,
242 datastreamID,
243 datastreamVersionNode);
245 //Read data from URL
246 //TODO: Large content read into memory!!
247 URLConnection urlConnection = contentURL.openConnection();
248 byte[] buffer = new byte[BUFFER_BLOCKSIZE];
249 InputStream inputStream = urlConnection.getInputStream();
250 ByteArrayOutputStream baos = new ByteArrayOutputStream();
251 int read;
252 while ((read = != -1) {
253 baos.write(buffer, 0, read);
254 }
255 return baos.toByteArray();
256 }
258 private static URL generateContentURL(Node controlGroupAttribute,
259 String server, String pid,
260 String datastreamID,
261 Node datastreamVersionNode)
262 throws MalformedURLException {
263 if (controlGroupAttribute.getNodeValue().equals("M")) {
264 //generate URL from PID and REF
265 return new URL(server + "/get/" + pid + "/" + datastreamID);
266 } else {
267 NodeList elms
268 = datastreamVersionNode.getChildNodes();
269 for (int c = 0; c < elms.getLength(); c++) {
270 Node child = elms.item(c);
271 if (child.getNodeType() == Node.ELEMENT_NODE
272 && child.getNodeName().equals(
273 "foxml:contentLocation")) {
274 return new URL(child.getAttributes().getNamedItem(
275 "REF").getNodeValue());
276 }
277 }
278 }
279 return null;
280 }
282 private static Node retainNewestVersion(
283 Node datastreamNode) {
284 NodeList children = datastreamNode.getChildNodes();
285 String newestDate = findNewestDate(children);
286 Node datastreamVersionNode = null;
287 for (int c = 0; c < children.getLength(); c++) {
288 Node child = children.item(c);
289 if (child.getNodeType() == Node.ELEMENT_NODE
290 && child.getNodeName().equals(
291 "foxml:datastreamVersion")) {
292 Node date = child.getAttributes().getNamedItem("CREATED");
293 if (date.getNodeValue().compareTo(newestDate) < 0) {
294 child.getParentNode().removeChild(child);
295 } else {
296 datastreamVersionNode = child;
297 }
298 }
299 }
300 return datastreamVersionNode;
301 }
303 private static String findNewestDate(NodeList children) {
304 String newestDate = null;
305 for (int c = 0; c < children.getLength(); c++) {
306 Node child = children.item(c);
307 if (child.getNodeType() == Node.ELEMENT_NODE
308 && child.getNodeName().equals(
309 "foxml:datastreamVersion")) {
310 Node date = child.getAttributes().getNamedItem("CREATED");
311 if (newestDate == null ||
312 date.getNodeValue().compareTo(newestDate) > 0) {
313 newestDate = date.getNodeValue();
314 }
315 }
316 }
317 return newestDate;
318 }
320 /**
321 * Get value of ID node of attributes, or null if it cannot be found.
322 *
323 * @param datastreamNode The datastream node
324 * @return value of ID, or null for none.
325 */
326 private static String getDatastreamID(Node datastreamNode) {
327 NamedNodeMap attributes = datastreamNode.getAttributes();
328 if (attributes == null) {
329 return null;
330 }
331 Node attribute = attributes.getNamedItem("ID");
332 if (attribute == null) {
333 return null;
334 }
335 String datastreamID = attribute.getNodeValue();
336 return datastreamID;
337 }
339 /**
340 * Read and parse a foxml document from Fedora.
341 *
342 * @param server Fedora server URL.
343 * @param fedoraUsername Username.
344 * @param fedoraPassword Password.
345 * @param pid PID for object.
346 * @param documentBuilder The doucment builder.
347 * @return The parsed document
348 *
349 * @throws Exception On any trouble reading the document.
350 */
351 private static Document readFoxmlDocument(String server,
352 String fedoraUsername,
353 String fedoraPassword, String pid,
354 DocumentBuilder documentBuilder)
355 throws Exception {
356 FedoraClient client = new FedoraClient(server, fedoraUsername,
357 fedoraPassword);
358 byte[] object = client.getAPIM().export(pid, "foxml1.0", "public");
359 return documentBuilder.parse(new ByteArrayInputStream(
360 object));
361 }
363 private static DocumentBuilder getDocumentBuilder() {
364 DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
365 dbf.setNamespaceAware(true);
366 DocumentBuilder documentBuilder;
367 try {
368 documentBuilder = dbf.newDocumentBuilder();
369 } catch (ParserConfigurationException e) {
370 throw new Error("Unable to initialise default document builder", e);
371 }
372 return documentBuilder;
373 }
374 }
Attached Files
To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily. You are not allowed to attach a file to this page.