001: /*
002: * METSExport.java
003: *
004: * Version: $Revision: 2174 $
005: *
006: * Date: $Date: 2007-08-30 06:10:07 -0500 (Thu, 30 Aug 2007) $
007: *
008: * Copyright (c) 2002-2005, Hewlett-Packard Company and Massachusetts
009: * Institute of Technology. All rights reserved.
010: *
011: * Redistribution and use in source and binary forms, with or without
012: * modification, are permitted provided that the following conditions are
013: * met:
014: *
015: * - Redistributions of source code must retain the above copyright
016: * notice, this list of conditions and the following disclaimer.
017: *
018: * - Redistributions in binary form must reproduce the above copyright
019: * notice, this list of conditions and the following disclaimer in the
020: * documentation and/or other materials provided with the distribution.
021: *
022: * - Neither the name of the Hewlett-Packard Company nor the name of the
023: * Massachusetts Institute of Technology nor the names of their
024: * contributors may be used to endorse or promote products derived from
025: * this software without specific prior written permission.
026: *
027: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
028: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
029: * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
030: * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
031: * HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
032: * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
033: * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
034: * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
035: * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
036: * TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
037: * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
038: * DAMAGE.
039: */
040: package org.dspace.app.mets;
041:
042: import java.io.File;
043: import java.io.FileInputStream;
044: import java.io.FileOutputStream;
045: import java.io.IOException;
046: import java.io.InputStream;
047: import java.io.OutputStream;
048: import java.net.URLEncoder;
049: import java.sql.SQLException;
050: import java.util.Date;
051: import java.util.Properties;
052:
053: import org.apache.commons.cli.CommandLine;
054: import org.apache.commons.cli.CommandLineParser;
055: import org.apache.commons.cli.HelpFormatter;
056: import org.apache.commons.cli.Options;
057: import org.apache.commons.cli.PosixParser;
058: import org.dspace.authorize.AuthorizeException;
059: import org.dspace.authorize.AuthorizeManager;
060: import org.dspace.content.Bitstream;
061: import org.dspace.content.BitstreamFormat;
062: import org.dspace.content.Bundle;
063: import org.dspace.content.Collection;
064: import org.dspace.content.DCValue;
065: import org.dspace.content.DSpaceObject;
066: import org.dspace.content.Item;
067: import org.dspace.content.ItemIterator;
068: import org.dspace.core.ConfigurationManager;
069: import org.dspace.core.Constants;
070: import org.dspace.core.Context;
071: import org.dspace.core.Utils;
072: import org.dspace.handle.HandleManager;
073: import org.dspace.app.util.Util;
074:
075: import edu.harvard.hul.ois.mets.Agent;
076: import edu.harvard.hul.ois.mets.AmdSec;
077: import edu.harvard.hul.ois.mets.BinData;
078: import edu.harvard.hul.ois.mets.Checksumtype;
079: import edu.harvard.hul.ois.mets.Div;
080: import edu.harvard.hul.ois.mets.DmdSec;
081: import edu.harvard.hul.ois.mets.FLocat;
082: import edu.harvard.hul.ois.mets.FileGrp;
083: import edu.harvard.hul.ois.mets.FileSec;
084: import edu.harvard.hul.ois.mets.Loctype;
085: import edu.harvard.hul.ois.mets.MdWrap;
086: import edu.harvard.hul.ois.mets.Mdtype;
087: import edu.harvard.hul.ois.mets.Mets;
088: import edu.harvard.hul.ois.mets.MetsHdr;
089: import edu.harvard.hul.ois.mets.Name;
090: import edu.harvard.hul.ois.mets.RightsMD;
091: import edu.harvard.hul.ois.mets.Role;
092: import edu.harvard.hul.ois.mets.StructMap;
093: import edu.harvard.hul.ois.mets.Type;
094: import edu.harvard.hul.ois.mets.XmlData;
095: import edu.harvard.hul.ois.mets.helper.Base64;
096: import edu.harvard.hul.ois.mets.helper.MetsException;
097: import edu.harvard.hul.ois.mets.helper.MetsValidator;
098: import edu.harvard.hul.ois.mets.helper.MetsWriter;
099: import edu.harvard.hul.ois.mets.helper.PCData;
100: import edu.harvard.hul.ois.mets.helper.PreformedXML;
101:
102: /**
103: * Tool for exporting DSpace AIPs with the metadata serialised in METS format
104: *
105: * @author Robert Tansley
106: * @version $Revision: 2174 $
107: */
108: public class METSExport {
109: private static int licenseFormat = -1;
110:
111: private static Properties dcToMODS;
112:
113: public static void main(String[] args) throws Exception {
114: Context context = new Context();
115:
116: init(context);
117:
118: // create an options object and populate it
119: CommandLineParser parser = new PosixParser();
120:
121: Options options = new Options();
122:
123: options.addOption("c", "collection", true,
124: "Handle of collection to export");
125: options
126: .addOption("i", "item", true,
127: "Handle of item to export");
128: options.addOption("a", "all", false,
129: "Export all items in the archive");
130: options.addOption("d", "destination", true,
131: "Destination directory");
132: options.addOption("h", "help", false, "Help");
133:
134: CommandLine line = parser.parse(options, args);
135:
136: if (line.hasOption('h')) {
137: HelpFormatter myhelp = new HelpFormatter();
138: myhelp.printHelp("metsexport", options);
139: System.out
140: .println("\nExport a collection: metsexport -c hdl:123.456/789");
141: System.out
142: .println("Export an item: metsexport -i hdl:123.456/890");
143: System.out.println("Export everything: metsexport -a");
144:
145: System.exit(0);
146: }
147:
148: String dest = "";
149:
150: if (line.hasOption('d')) {
151: dest = line.getOptionValue('d');
152:
153: // Make sure it ends with a file separator
154: if (!dest.endsWith(File.separator)) {
155: dest = dest + File.separator;
156: }
157: }
158:
159: if (line.hasOption('i')) {
160: String handle = getHandleArg(line.getOptionValue('i'));
161:
162: // Exporting a single item
163: DSpaceObject o = HandleManager.resolveToObject(context,
164: handle);
165:
166: if ((o != null) && o instanceof Item) {
167: writeAIP(context, (Item) o, dest);
168: System.exit(0);
169: } else {
170: System.err.println(line.getOptionValue('i')
171: + " is not a valid item Handle");
172: System.exit(1);
173: }
174: }
175:
176: ItemIterator items = null;
177:
178: if (line.hasOption('c')) {
179: String handle = getHandleArg(line.getOptionValue('c'));
180:
181: // Exporting a collection's worth of items
182: DSpaceObject o = HandleManager.resolveToObject(context,
183: handle);
184:
185: if ((o != null) && o instanceof Collection) {
186: items = ((Collection) o).getItems();
187: } else {
188: System.err.println(line.getOptionValue('c')
189: + " is not a valid collection Handle");
190: System.exit(1);
191: }
192: }
193:
194: if (line.hasOption('a')) {
195: items = Item.findAll(context);
196: }
197:
198: if (items == null) {
199: System.err.println("Nothing to export specified!");
200: System.exit(1);
201: }
202:
203: while (items.hasNext()) {
204: writeAIP(context, items.next(), dest);
205: }
206:
207: context.abort();
208: System.exit(0);
209: }
210:
211: /**
212: * Initialise various variables, read in config etc.
213: *
214: * @param context
215: * DSpace context
216: */
217: private static void init(Context context) throws SQLException,
218: IOException {
219: // Don't init again if initialised already
220: if (licenseFormat != -1) {
221: return;
222: }
223:
224: // Find the License format
225: BitstreamFormat bf = BitstreamFormat.findByShortDescription(
226: context, "License");
227: licenseFormat = bf.getID();
228:
229: // get path to DC->MODS map info file
230: String configFile = ConfigurationManager
231: .getProperty("dspace.dir")
232: + File.separator
233: + "config"
234: + File.separator
235: + "dc2mods.cfg";
236:
237: // Read it in
238: InputStream is = new FileInputStream(configFile);
239: dcToMODS = new Properties();
240: dcToMODS.load(is);
241: }
242:
243: /**
244: * Write out the AIP for the given item to the given directory. A new
245: * directory will be created with the Handle (URL-encoded) as the directory
246: * name, and inside, a mets.xml file written, together with the bitstreams.
247: *
248: * @param context
249: * DSpace context to use
250: * @param item
251: * Item to write
252: * @param dest
253: * destination directory
254: */
255: public static void writeAIP(Context context, Item item, String dest)
256: throws SQLException, IOException, AuthorizeException,
257: MetsException {
258: System.out.println("Exporting item hdl:" + item.getHandle());
259:
260: // Create aip directory
261: java.io.File aipDir = new java.io.File(dest
262: + URLEncoder.encode("hdl:" + item.getHandle(), "UTF-8"));
263:
264: if (!aipDir.mkdir()) {
265: // Couldn't make the directory for some reason
266: throw new IOException("Couldn't create "
267: + aipDir.toString());
268: }
269:
270: // Write the METS file
271: FileOutputStream out = new FileOutputStream(aipDir.toString()
272: + java.io.File.separator + "mets.xml");
273: writeMETS(context, item, out, false);
274: out.close();
275:
276: // Write bitstreams
277: Bundle[] bundles = item.getBundles();
278:
279: for (int i = 0; i < bundles.length; i++) {
280: Bitstream[] bitstreams = bundles[i].getBitstreams();
281:
282: for (int b = 0; b < bitstreams.length; b++) {
283: // Skip license bitstream and unauthorized resources
284: if ((bitstreams[b].getFormat().getID() != licenseFormat)
285: && AuthorizeManager.authorizeActionBoolean(
286: context, bitstreams[b], Constants.READ)) {
287: out = new FileOutputStream(aipDir.toString()
288: + java.io.File.separator
289: + bitstreams[b].getName());
290:
291: InputStream in = bitstreams[b].retrieve();
292: Utils.bufferedCopy(in, out);
293: out.close();
294: in.close();
295: }
296: }
297: }
298: }
299:
300: /**
301: * Write METS metadata corresponding to the metadata for an item
302: *
303: * @param context
304: * DSpace context
305: * @param item
306: * DSpace item to create METS object for
307: * @param os
308: * A stream to write METS package to (UTF-8 encoding will be used)
309: * @param fullURL
310: * if <code>true</code>, the <FLocat> values for each
311: * bitstream will be the full URL for that bitstream. Otherwise,
312: * only the filename itself will be used.
313: */
314: public static void writeMETS(Context context, Item item,
315: OutputStream os, boolean fullURL) throws SQLException,
316: IOException, AuthorizeException {
317: try {
318: init(context);
319:
320: // Create the METS file
321: Mets mets = new Mets();
322:
323: // Top-level stuff
324: mets.setOBJID("hdl:" + item.getHandle());
325: mets.setLABEL("DSpace Item");
326: mets
327: .setSchema("mods", "http://www.loc.gov/mods/v3",
328: "http://www.loc.gov/standards/mods/v3/mods-3-0.xsd");
329:
330: // MetsHdr
331: MetsHdr metsHdr = new MetsHdr();
332: metsHdr.setCREATEDATE(new Date()); // FIXME: CREATEDATE is now:
333: // maybe should be item create
334: // date?
335:
336: // Agent
337: Agent agent = new Agent();
338: agent.setROLE(Role.CUSTODIAN);
339: agent.setTYPE(Type.ORGANIZATION);
340:
341: Name name = new Name();
342: name.getContent().add(
343: new PCData(ConfigurationManager
344: .getProperty("dspace.name")));
345: agent.getContent().add(name);
346:
347: metsHdr.getContent().add(agent);
348:
349: mets.getContent().add(metsHdr);
350:
351: DmdSec dmdSec = new DmdSec();
352: dmdSec.setID("DMD_hdl_" + item.getHandle());
353:
354: MdWrap mdWrap = new MdWrap();
355: mdWrap.setMDTYPE(Mdtype.MODS);
356:
357: XmlData xmlData = new XmlData();
358: createMODS(item, xmlData);
359:
360: mdWrap.getContent().add(xmlData);
361: dmdSec.getContent().add(mdWrap);
362: mets.getContent().add(dmdSec);
363:
364: // amdSec
365: AmdSec amdSec = new AmdSec();
366: amdSec.setID("TMD_hdl_" + item.getHandle());
367:
368: // FIXME: techMD here
369: // License as <rightsMD><mdWrap><binData>base64encoded</binData>...
370: InputStream licenseStream = findLicense(context, item);
371:
372: if (licenseStream != null) {
373: RightsMD rightsMD = new RightsMD();
374: MdWrap rightsMDWrap = new MdWrap();
375: rightsMDWrap.setMIMETYPE("text/plain");
376: rightsMDWrap.setMDTYPE(Mdtype.OTHER);
377: rightsMDWrap.setOTHERMDTYPE("TEXT");
378:
379: BinData binData = new BinData();
380: Base64 base64 = new Base64(licenseStream);
381:
382: binData.getContent().add(base64);
383: rightsMDWrap.getContent().add(binData);
384: rightsMD.getContent().add(rightsMDWrap);
385: amdSec.getContent().add(rightsMD);
386: }
387:
388: // FIXME: History data???? Nooooo!!!!
389: mets.getContent().add(amdSec);
390:
391: // fileSec
392: FileSec fileSec = new FileSec();
393: boolean fileSecEmpty = true;
394:
395: Bundle[] bundles = item.getBundles();
396:
397: for (int i = 0; i < bundles.length; i++) {
398: Bitstream[] bitstreams = bundles[i].getBitstreams();
399:
400: // Unusual condition, but if no bitstreams, skip this bundle
401: if (bitstreams.length == 0) {
402: continue;
403: }
404:
405: // First: we skip the license bundle, since it's included
406: // elsewhere
407: if (bitstreams[0].getFormat().getID() == licenseFormat) {
408: continue;
409: }
410:
411: // Create a fileGrp
412: FileGrp fileGrp = new FileGrp();
413:
414: // Bundle name for USE attribute
415: if ((bundles[i].getName() != null)
416: && !bundles[i].getName().equals("")) {
417: fileGrp.setUSE(bundles[i].getName());
418: }
419:
420: for (int bits = 0; bits < bitstreams.length; bits++) {
421: // What's the persistent(-ish) ID?
422: String bitstreamPID = ConfigurationManager
423: .getProperty("dspace.url")
424: + "/bitstream/"
425: + item.getHandle()
426: + "/"
427: + bitstreams[bits].getSequenceID()
428: + "/"
429: + Util.encodeBitstreamName(bitstreams[bits]
430: .getName(), "UTF-8");
431:
432: edu.harvard.hul.ois.mets.File file = new edu.harvard.hul.ois.mets.File();
433:
434: /*
435: * ID: we use the unique part of the persistent ID, i.e. the
436: * Handle + sequence number, but with _'s instead of /'s so
437: * it's a legal xsd:ID.
438: */
439: String xmlIDstart = item.getHandle().replaceAll(
440: "/", "_")
441: + "_";
442:
443: file.setID(xmlIDstart
444: + bitstreams[bits].getSequenceID());
445:
446: String groupID = "GROUP_" + xmlIDstart
447: + bitstreams[bits].getSequenceID();
448:
449: /*
450: * If we're in THUMBNAIL or TEXT bundles, the bitstream is
451: * extracted text or a thumbnail, so we use the name to work
452: * out which bitstream to be in the same group as
453: */
454: if ((bundles[i].getName() != null)
455: && (bundles[i].getName()
456: .equals("THUMBNAIL") || bundles[i]
457: .getName().equals("TEXT"))) {
458: // Try and find the original bitstream, and chuck the
459: // derived
460: // bitstream in the same group
461: Bitstream original = findOriginalBitstream(
462: item, bitstreams[bits]);
463:
464: if (original != null) {
465: groupID = "GROUP_" + xmlIDstart
466: + original.getSequenceID();
467: }
468: }
469:
470: file.setGROUPID(groupID);
471: file.setOWNERID(bitstreamPID);
472:
473: // FIXME: ADMID should point to appropriate TechMD section
474: // above
475: file.setMIMETYPE(bitstreams[bits].getFormat()
476: .getMIMEType());
477:
478: // FIXME: CREATED: no date
479: file.setSIZE(bitstreams[bits].getSize());
480: file.setCHECKSUM(bitstreams[bits].getChecksum());
481: file.setCHECKSUMTYPE(Checksumtype.MD5);
482:
483: // FLocat: filename is as in records, or full URL
484: // FIXME: Duplicate filenames and characters illegal to
485: // local OS may cause problems
486: FLocat flocat = new FLocat();
487: flocat.setLOCTYPE(Loctype.URL);
488: if (fullURL) {
489: flocat.setXlinkHref(bitstreamPID);
490: } else {
491: flocat.setXlinkHref(bitstreams[bits].getName());
492: }
493:
494: // Add FLocat to File, and File to FileGrp
495: file.getContent().add(flocat);
496: fileGrp.getContent().add(file);
497: }
498:
499: // Add fileGrp to fileSec
500: fileSec.getContent().add(fileGrp);
501: fileSecEmpty = false;
502: }
503:
504: // Add fileSec to document
505: if (!fileSecEmpty) {
506: mets.getContent().add(fileSec);
507: }
508:
509: // FIXME: Add Structmap here, but it is empty and we won't use it now.
510: StructMap structMap = new StructMap();
511: Div div = new Div();
512: structMap.getContent().add(div);
513: mets.getContent().add(structMap);
514:
515: mets.validate(new MetsValidator());
516:
517: mets.write(new MetsWriter(os));
518: } catch (MetsException e) {
519: // We don't pass up a MetsException, so callers don't need to
520: // know the details of the METS toolkit
521: e.printStackTrace();
522: throw new IOException(e.getMessage());
523: }
524: }
525:
526: /**
527: * Utility to find the license bitstream from an item
528: *
529: * @param context
530: * DSpace context
531: * @param item
532: * the item
533: * @return the license as a string
534: *
535: * @throws IOException
536: * if the license bitstream can't be read
537: */
538: private static InputStream findLicense(Context context, Item item)
539: throws SQLException, IOException, AuthorizeException {
540: Bundle[] bundles = item.getBundles();
541:
542: for (int i = 0; i < bundles.length; i++) {
543: // Assume license will be in its own bundle
544: Bitstream[] bitstreams = bundles[i].getBitstreams();
545:
546: if (bitstreams.length > 0) {
547: if (bitstreams[0].getFormat().getID() == licenseFormat) {
548: // Read the license into a string
549: return bitstreams[0].retrieve();
550: }
551: }
552: }
553:
554: // Oops! No license!
555: return null;
556: }
557:
558: /**
559: * For a bitstream that's a thumbnail or extracted text, find the
560: * corresponding bitstream in the ORIGINAL bundle
561: *
562: * @param item
563: * the item we're dealing with
564: * @param derived
565: * the derived bitstream
566: *
567: * @return the corresponding original bitstream (or null)
568: */
569: private static Bitstream findOriginalBitstream(Item item,
570: Bitstream derived) throws SQLException {
571: Bundle[] bundles = item.getBundles();
572:
573: // Filename of original will be filename of the derived bitstream
574: // minus the extension (last 4 chars - .jpg or .txt)
575: String originalFilename = derived.getName().substring(0,
576: derived.getName().length() - 4);
577:
578: // First find "original" bundle
579: for (int i = 0; i < bundles.length; i++) {
580: if ((bundles[i].getName() != null)
581: && bundles[i].getName().equals("ORIGINAL")) {
582: // Now find the corresponding bitstream
583: Bitstream[] bitstreams = bundles[i].getBitstreams();
584:
585: for (int bsnum = 0; bsnum < bitstreams.length; bsnum++) {
586: if (bitstreams[bsnum].getName().equals(
587: originalFilename)) {
588: return bitstreams[bsnum];
589: }
590: }
591: }
592: }
593:
594: // Didn't find it
595: return null;
596: }
597:
598: /**
599: * Create MODS metadata from the DC in the item, and add to the given
600: * XmlData METS object.
601: *
602: * @param item
603: * the item
604: * @param xmlData
605: * xmlData to add MODS to.
606: */
607: private static void createMODS(Item item, XmlData xmlData) {
608: DCValue[] dc = item.getDC(Item.ANY, Item.ANY, Item.ANY);
609:
610: StringBuffer modsXML = new StringBuffer();
611:
612: for (int i = 0; i < dc.length; i++) {
613: // Get the property name - element[.qualifier]
614: String propName = ((dc[i].qualifier == null) ? dc[i].element
615: : (dc[i].element + "." + dc[i].qualifier));
616:
617: String modsMapping = dcToMODS.getProperty(propName);
618:
619: if (modsMapping == null) {
620: System.err.println("WARNING: No MODS mapping for "
621: + propName);
622: } else {
623: String value = dc[i].value;
624:
625: // Replace all $'s with \$ so it doesn't trip up the replaceAll!
626: if (value != null && value.length() > 0) {
627: // RegExp note: Yes, there really does need to be this many backslashes!
628: // To have \$ inserted in the replacement, both the backslash and the dollar
629: // have to be escaped (backslash) - so the replacemenet string has to be
630: // passed as \\\$. All of those backslashes then have to escaped in the literal
631: // for them to be in string used!!!
632: value = dc[i].value.replaceAll("\\$", "\\\\\\$");
633: }
634:
635: // Replace '%s' with DC value (with entities encoded)
636: modsXML.append(modsMapping.replaceAll("%s", Utils
637: .addEntities(value)));
638: modsXML.append("\n"); // For readability
639: }
640: }
641:
642: PreformedXML pXML = new PreformedXML(modsXML.toString());
643: xmlData.getContent().add(pXML);
644: }
645:
646: /**
647: * Get the handle from the command line in the form 123.456/789. Doesn't
648: * matter if incoming handle has 'hdl:' or 'http://hdl....' before it.
649: *
650: * @param original
651: * Handle as passed in by user
652: * @return Handle as can be looked up in our table
653: */
654: private static String getHandleArg(String original) {
655: if (original.startsWith("hdl:")) {
656: return original.substring(4);
657: }
658:
659: if (original.startsWith("http://hdl.handle.net/")) {
660: return original.substring(22);
661: }
662:
663: return original;
664: }
665: }
|