001: /**
002: * Copyright (c) 2005, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: *
030: */package org.pdfbox.examples.util;
031:
032: import org.pdfbox.exceptions.InvalidPasswordException;
033:
034: import org.pdfbox.pdmodel.PDDocument;
035: import org.pdfbox.pdmodel.PDPage;
036: import org.pdfbox.util.PDFTextStripperByArea;
037:
038: import java.awt.Rectangle;
039:
040: import java.util.List;
041:
042: /**
043: * This is an example on how to extract text from a specific area on the PDF document.
044: *
045: * Usage: java org.pdfbox.examples.util.ExtractTextByArea <input-pdf>
046: *
047: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
048: * @version $Revision: 1.2 $
049: */
050: public class ExtractTextByArea {
051: private ExtractTextByArea() {
052: //utility class and should not be constructed.
053: }
054:
055: /**
056: * This will print the documents text in a certain area.
057: *
058: * @param args The command line arguments.
059: *
060: * @throws Exception If there is an error parsing the document.
061: */
062: public static void main(String[] args) throws Exception {
063: if (args.length != 1) {
064: usage();
065: } else {
066: PDDocument document = null;
067: try {
068: document = PDDocument.load(args[0]);
069: if (document.isEncrypted()) {
070: try {
071: document.decrypt("");
072: } catch (InvalidPasswordException e) {
073: System.err
074: .println("Error: Document is encrypted with a password.");
075: System.exit(1);
076: }
077: }
078: PDFTextStripperByArea stripper = new PDFTextStripperByArea();
079: stripper.setSortByPosition(true);
080: Rectangle rect = new Rectangle(10, 280, 275, 60);
081: stripper.addRegion("class1", rect);
082: List allPages = document.getDocumentCatalog()
083: .getAllPages();
084: PDPage firstPage = (PDPage) allPages.get(0);
085: stripper.extractRegions(firstPage);
086: System.out.println("Text in the area:" + rect);
087: System.out.println(stripper.getTextForRegion("class1"));
088:
089: } finally {
090: if (document != null) {
091: document.close();
092: }
093: }
094: }
095: }
096:
097: /**
098: * This will print the usage for this document.
099: */
100: private static void usage() {
101: System.err
102: .println("Usage: java org.pdfbox.examples.util.ExtractTextByArea <input-pdf>");
103: }
104:
105: }
|