Attachment 'DigitalObjectBundleCreator.java'

Download

   1 package dk.statsbiblioteket.doms.disseminator;
   2 
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;

import fedora.client.FedoraClient;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import sun.misc.BASE64Encoder;
  28 
  29 /**
  30  * Utility class for creating a digital object bundle
  31  */
  32 public class DigitalObjectBundleCreator {
  33     private static final Log LOG
  34             = LogFactory.getLog(DigitalObjectBundleCreator.class);
  35     private static final int BUFFER_BLOCKSIZE = 8192;
  36 
  37     /**
  38      * Package up a number of Fedora digital objects in a digital object bundle.
  39      * Will simply list one FoxML object after the other. All relations with
  40      * given predicates will be followed. Datastreams in a given list will be
  41      * inlined as inline XML in a CDATA BASE64-encoded section.
  42      *
  43      * @param startingURLs List of object URLs. The URLs may contain a
  44      * datastream name as well, it will be stripped off. URLs not recognised as
  45      * URLs to a fedora object will be ignored.
  46      * @param datastreamsToInclude List of IDs of datastreams to be inlined
  47      * @param relsToFollow
  48      * @return A document with the bundle
  49      *
  50      * @throws Error on exceptions that should never happen, like unable to
  51      * initialise default document builder.
  52      */
  53     public static Document createBundle(Set<String> startingURLs,
  54                                         Set<String> datastreamsToInclude,
  55                                         Set<String> relsToFollow)
  56             throws Error {
  57         //Initialise result document
  58         DocumentBuilder documentBuilder = getDocumentBuilder();
  59         Document result = documentBuilder.newDocument();
  60         Element topLevelElm = result.createElementNS(
  61                 "http://fedora.statsbiblioteket.dk/datatypes/digitalObjectBundle/",
  62                 "d:digitalObjectBundle");
  63         result.appendChild(topLevelElm);
  64         //TODO: For completeness - handle DOMExceptions as errors?
  65 
  66         List<String> urlList = new ArrayList<String>(startingURLs);
  67         for (int index = 0; index < urlList.size(); index++) {
  68             String url = urlList.get(index); 
  69             System.out.println("URL: " + url);
  70             //Strip the server and PID from the URL
  71             Matcher matcher = Pattern.compile("(.*)/get/([^/]+)(/.*)?").matcher(
  72                     url);
  73             if (!matcher.find()) {
  74                 LOG.warn("The url '" + url
  75                          + "' does not seem to be a Fedora URL. Ignoring");
  76                 //just ignore this URL
  77                 continue;
  78             }
  79             String server = matcher.group(1);
  80             String pid = matcher.group(2);
  81 
  82             //Fetch and parse the contents from Fedora
  83             String fedoraUsername = "fedora";
  84             String fedoraPassword = "fedora";
  85             //TODO: Real user/pass from properties
  86             Document fedoraObject;
  87             try {
  88                 fedoraObject = readFoxmlDocument(server, fedoraUsername,
  89                                                  fedoraPassword,
  90                                                  pid, documentBuilder);
  91             } catch (Exception e) {
  92                 LOG.error("Unable to get PID '" + pid + "' from server '"
  93                           + server + "'. Ignoring the URL '" + url + "'", e);
  94                 //unable to connect to fedora in that URL. Just ignore it.
  95                 continue;
  96             }
  97 
  98             //For all datastreams do
  99             NodeList datastreamNodes = fedoraObject.getElementsByTagNameNS(
 100                     "info:fedora/fedora-system:def/foxml#", "datastream");
 101             if (datastreamNodes != null) {
 102                 for (int i = 0; i < datastreamNodes.getLength(); i++) {
 103                     Node datastreamNode = datastreamNodes.item(i);
 104                     checkDatastream(datastreamNode, datastreamsToInclude,
 105                                     server, pid, fedoraObject);
 106 
 107                 }
 108             }
 109 
 110             //Drop contents in result
 111             Node importedD
 112                     = result.importNode(fedoraObject.getFirstChild(), true);
 113             topLevelElm.appendChild(importedD);
 114 
 115             //TODO: Currently follows relations from all datastreams
 116             NodeList rdfNodes = fedoraObject.getElementsByTagNameNS(
 117                     "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
 118                     "Description");
 119 
 120             //Run through all relations
 121             for (int i = 0; i < rdfNodes.getLength(); i++) {
 122                 Node descriptionNode = rdfNodes.item(i);
 123                 //TODO: Check about-attribute
 124                 NodeList children = descriptionNode.getChildNodes();
 125                 for (int c = 0; c < children.getLength(); c++) {
 126                     Node child = children.item(c);
 127                     System.out.println("Child:" + child.getNodeName());
 128                     //for children that we should follow
 129                     if (child.getNodeType() == Node.ELEMENT_NODE
 130                         && relsToFollow.contains(child.getNodeName())) {
 131                         //find the endpoint
 132                         NamedNodeMap attributes = child.getAttributes();
 133                         if (attributes != null) {
 134                             System.out.println("Attrbute length: " + attributes.getLength());
 135                             Node resourceAttribute
 136                                     = attributes.getNamedItem("rdf:resource");
 137                             String relEndPoint
 138                                     = resourceAttribute.getNodeValue();
 139                             //add url to list of starting points
 140                             String newUrl;
 141                             if (relEndPoint.startsWith("info:fedora/")) {
 142                                 newUrl = server + "/get/"
 143                                                 + relEndPoint.substring(12)
 144                                                 + "/DC";
 145                             } else {
 146                                 newUrl = relEndPoint;
 147                             }
 148                             LOG.info("Adding '" + newUrl
 149                                      + "' to Bundle");
 150                             System.out.println("Adding '" + newUrl
 151                                      + "' to Bundle");
 152                             urlList.add(newUrl);
 153                         }
 154                     }
 155 
 156                 }
 157 
 158             }
 159 
 160         }
 161 
 162         return result;
 163     }
 164 
 165     private static void checkDatastream(Node datastreamNode,
 166                                         Set<String> datastreamsToInclude,
 167                                         String server, String pid,
 168                                         Document fedoraObject) {
 169 
 170         //Retain only newest version
 171         Node datastreamVersionNode = retainNewestVersion(
 172                 datastreamNode);
 173         //If name in datastreamsToInclude,
 174         String datastreamID = getDatastreamID(datastreamNode);
 175         if (datastreamID == null) {
 176             LOG.error("Malformed datastream ID. Ignoring");
 177         }
 178         if (datastreamsToInclude.contains(datastreamID)) {
 179             Node controlGroupAttribute
 180                     = datastreamNode.getAttributes().getNamedItem(
 181                     "CONTROL_GROUP");
 182             //TODO: Check for nulls
 183             if (controlGroupAttribute.getNodeValue().equals("R")
 184                 || controlGroupAttribute.getNodeValue().equals("E")
 185                 || controlGroupAttribute.getNodeValue().equals("M")) {
 186 
 187                 inlineDatastream(controlGroupAttribute, server,
 188                                  pid,
 189                                  datastreamID, fedoraObject,
 190                                  datastreamVersionNode);
 191             }
 192         }
 193     }
 194 
 195     private static void inlineDatastream(
 196             Node controlGroupAttribute,
 197             String server, String pid,
 198             String datastreamID,
 199             Document fedoraObject,
 200             Node datastreamVersionNode) {
 201         byte[] bytes = new byte[0];
 202         try {
 203             bytes = readContentData(controlGroupAttribute, server, pid,
 204                                     datastreamID,
 205                                     datastreamVersionNode);
 206         } catch (IOException e) {
 207             LOG.error("Unable to inline datastream '" + datastreamID + "' from"
 208                       + " object '" + pid + "'. Ignoring.", e);
 209             return;
 210         }
 211 
 212         //Replace datastream with xmlContents datastream
 213         while (datastreamVersionNode.getFirstChild()
 214                != null) {
 215             datastreamVersionNode.removeChild(
 216                     datastreamVersionNode.getFirstChild());
 217         }
 218         controlGroupAttribute.setNodeValue("X");
 219         Node xmlContent = fedoraObject.createElementNS(
 220                 "info:fedora/fedora-system:def/foxml#",
 221                 "foxml:xmlContent");
 222         datastreamVersionNode.appendChild(xmlContent);
 223 
 224         //Insert CDATA element with contents
 225         Node content = fedoraObject.createElementNS(
 226                 "http://fedora.statsbiblioteket.dk/datatypes/digitalObjectBundle/",
 227                 "d:content");
 228         xmlContent.appendChild(content);
 229         Node cdata = fedoraObject.createCDATASection(
 230                 new BASE64Encoder().encode(bytes));
 231         //TODO: BASE64Encoder is not a standard class!!
 232         content.appendChild(cdata);
 233     }
 234 
 235     private static byte[] readContentData(Node controlGroupAttribute,
 236                                           String server, String pid,
 237                                           String datastreamID,
 238                                           Node datastreamVersionNode)
 239             throws IOException {
 240         //url of the contents to inline
 241         URL contentURL = generateContentURL(controlGroupAttribute, server, pid,
 242                                             datastreamID,
 243                                             datastreamVersionNode);
 244 
 245         //Read data from URL
 246         //TODO: Large content read into memory!!
 247         URLConnection urlConnection = contentURL.openConnection();
 248         byte[] buffer = new byte[BUFFER_BLOCKSIZE];
 249         InputStream inputStream = urlConnection.getInputStream();
 250         ByteArrayOutputStream baos = new ByteArrayOutputStream();
 251         int read;
 252         while ((read = inputStream.read(buffer)) != -1) {
 253             baos.write(buffer, 0, read);
 254         }
 255         return baos.toByteArray();
 256     }
 257 
 258     private static URL generateContentURL(Node controlGroupAttribute,
 259                                           String server, String pid,
 260                                           String datastreamID,
 261                                           Node datastreamVersionNode)
 262             throws MalformedURLException {
 263         if (controlGroupAttribute.getNodeValue().equals("M")) {
 264             //generate URL from PID and REF
 265             return new URL(server + "/get/" + pid + "/" + datastreamID);
 266         } else {
 267             NodeList elms
 268                     = datastreamVersionNode.getChildNodes();
 269             for (int c = 0; c < elms.getLength(); c++) {
 270                 Node child = elms.item(c);
 271                 if (child.getNodeType() == Node.ELEMENT_NODE
 272                     && child.getNodeName().equals(
 273                         "foxml:contentLocation")) {
 274                     return new URL(child.getAttributes().getNamedItem(
 275                             "REF").getNodeValue());
 276                 }
 277             }
 278         }
 279         return null;
 280     }
 281 
 282     private static Node retainNewestVersion(
 283             Node datastreamNode) {
 284         NodeList children = datastreamNode.getChildNodes();
 285         String newestDate = findNewestDate(children);
 286         Node datastreamVersionNode = null;
 287         for (int c = 0; c < children.getLength(); c++) {
 288             Node child = children.item(c);
 289             if (child.getNodeType() == Node.ELEMENT_NODE
 290                 && child.getNodeName().equals(
 291                     "foxml:datastreamVersion")) {
 292                 Node date = child.getAttributes().getNamedItem("CREATED");
 293                 if (date.getNodeValue().compareTo(newestDate) < 0) {
 294                     child.getParentNode().removeChild(child);
 295                 } else {
 296                     datastreamVersionNode = child;
 297                 }
 298             }
 299         }
 300         return datastreamVersionNode;
 301     }
 302 
 303     private static String findNewestDate(NodeList children) {
 304         String newestDate = null;
 305         for (int c = 0; c < children.getLength(); c++) {
 306             Node child = children.item(c);
 307             if (child.getNodeType() == Node.ELEMENT_NODE
 308                 && child.getNodeName().equals(
 309                     "foxml:datastreamVersion")) {
 310                 Node date = child.getAttributes().getNamedItem("CREATED");
 311                 if (newestDate == null ||
 312                     date.getNodeValue().compareTo(newestDate) > 0) {
 313                     newestDate = date.getNodeValue();
 314                 }
 315             }
 316         }
 317         return newestDate;
 318     }
 319 
 320     /**
 321      * Get value of ID node of attributes, or null if it cannot be found.
 322      *
 323      * @param datastreamNode The datastream node
 324      * @return value of ID, or null for none.
 325      */
 326     private static String getDatastreamID(Node datastreamNode) {
 327         NamedNodeMap attributes = datastreamNode.getAttributes();
 328         if (attributes == null) {
 329             return null;
 330         }
 331         Node attribute = attributes.getNamedItem("ID");
 332         if (attribute == null) {
 333             return null;
 334         }
 335         String datastreamID = attribute.getNodeValue();
 336         return datastreamID;
 337     }
 338 
 339     /**
 340      * Read and parse a foxml document from Fedora.
 341      *
 342      * @param server Fedora server URL.
 343      * @param fedoraUsername Username.
 344      * @param fedoraPassword Password.
 345      * @param pid PID for object.
 346      * @param documentBuilder The doucment builder.
 347      * @return The parsed document
 348      *
 349      * @throws Exception On any trouble reading the document.
 350      */
 351     private static Document readFoxmlDocument(String server,
 352                                               String fedoraUsername,
 353                                               String fedoraPassword, String pid,
 354                                               DocumentBuilder documentBuilder)
 355             throws Exception {
 356         FedoraClient client = new FedoraClient(server, fedoraUsername,
 357                                                fedoraPassword);
 358         byte[] object = client.getAPIM().export(pid, "foxml1.0", "public");
 359         return documentBuilder.parse(new ByteArrayInputStream(
 360                 object));
 361     }
 362 
 363     private static DocumentBuilder getDocumentBuilder() {
 364         DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
 365         dbf.setNamespaceAware(true);
 366         DocumentBuilder documentBuilder;
 367         try {
 368             documentBuilder = dbf.newDocumentBuilder();
 369         } catch (ParserConfigurationException e) {
 370             throw new Error("Unable to initialise default document builder", e);
 371         }
 372         return documentBuilder;
 373     }
 374 }

Attached Files

To refer to attachments on a page, use attachment:filename, as shown below in the list of files. Do NOT use the URL of the [get] link, since this is subject to change and can break easily.
  • [get | view] (2008-06-26 13:22:45, 15.4 KB) [[attachment:DigitalObjectBundleCreator.java]]
  • [get | view] (2008-06-26 13:22:31, 1.6 KB) [[attachment:digitalObjectBundle.jsp]]
  • [get | view] (2008-06-26 13:22:37, 220.5 KB) [[attachment:digitalObjectBundle.xml]]
 All files | Selected Files: delete move to page copy to page

You are not allowed to attach a file to this page.