001: /**
002: * Copyright (c) 2003-2005, www.pdfbox.org
003: * All rights reserved.
004: *
005: * Redistribution and use in source and binary forms, with or without
006: * modification, are permitted provided that the following conditions are met:
007: *
008: * 1. Redistributions of source code must retain the above copyright notice,
009: * this list of conditions and the following disclaimer.
010: * 2. Redistributions in binary form must reproduce the above copyright notice,
011: * this list of conditions and the following disclaimer in the documentation
012: * and/or other materials provided with the distribution.
013: * 3. Neither the name of pdfbox; nor the names of its
014: * contributors may be used to endorse or promote products derived from this
015: * software without specific prior written permission.
016: *
017: * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
018: * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
019: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
020: * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
021: * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
022: * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023: * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
024: * ANY THEORY OF LIABILIT, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
025: * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
026: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
027: *
028: * http://www.pdfbox.org
029: */package test.pdfbox.util;
030:
031: import java.io.File;
032: import java.io.FileInputStream;
033: import java.io.FilenameFilter;
034: import java.io.FileOutputStream;
035: import java.io.IOException;
036: import java.io.InputStreamReader;
037: import java.io.LineNumberReader;
038: import java.io.OutputStream;
039: import java.io.OutputStreamWriter;
040: import java.io.Writer;
041:
042: import junit.framework.Test;
043: import junit.framework.TestCase;
044: import junit.framework.TestSuite;
045:
046: import org.pdfbox.pdmodel.PDDocument;
047:
048: import org.pdfbox.util.PDFTextStripper;
049:
050: /**
051: * Test suite for PDFTextStripper.
052: *
053: * FILE SET VALIDATION
054: *
055: * This test suite is designed to test PDFTextStripper using a set of PDF
056: * files and known good output for each. The default mode of testAll()
057: * is to process each *.pdf file in "test/input". An output file is
058: * created in "test/output" with the same name as the PDF file, plus an
059: * additional ".txt" suffix.
060: *
061: * The output file is then tested against a known good result file from
062: * the input directory (again, with the same name as the tested PDF file,
063: * but with the additional ".txt" suffix).
064: *
065: * So for the file "test/input/hello.pdf", an output file will be generated
066: * named "test/output/hello.pdf.txt". Then that file will be compared to
067: * the known good file "test/input/hello.pdf.txt", if it exists.
068: *
069: * Any errors are logged, and at the end of processing all *.pdf files, if
070: * there were any errors, the test fails. The logging is at INFO, as the
071: * general goal is overall validation, and on failure, the indication of
072: * which file or files failed.
073: *
074: * When processing new PDF files, you may use testAll() to generate output,
075: * verify the output manually, then move the output file to the test input
076: * directory to use as the basis for future validations.
077: *
078: * SINGLE FILE VALIDATION
079: *
080: * To further research individual failures, the test.pdfbox.util.TextStripper.file
081: * system property may be set with the name of a single file in the "test/input"
082: * directory. In this mode, testAll() will evaluate only that file, and will
083: * do so with DEBUG level logging. You can set this property from ant by
084: * defining "file", as in:
085: *
086: * ant testextract -Dfile=hello.pdf
087: *
088: * @author Robert Dickinson (bob@brutesquadlabs.com)
089: * @author <a href="mailto:ben@benlitchfield.com">Ben Litchfield</a>
090: * @version $Revision: 1.17 $
091: */
092: public class TestTextStripper extends TestCase {
093: private boolean bFail = false;
094: private PDFTextStripper stripper = null;
095:
096: /**
097: * Test class constructor.
098: *
099: * @param name The name of the test class.
100: *
101: * @throws IOException If there is an error creating the test.
102: */
103: public TestTextStripper(String name) throws IOException {
104: super (name);
105: stripper = new PDFTextStripper();
106: stripper.setLineSeparator("\n");
107: }
108:
109: /**
110: * Test suite setup.
111: */
112: public void setUp() {
113: // If you want to test a single file using DEBUG logging, from an IDE,
114: // you can do something like this:
115: //
116: // System.setProperty("test.pdfbox.util.TextStripper.file", "FVS318Ref.pdf");
117: }
118:
119: /**
120: * Determine whether two strings are equal, where two null strings are
121: * considered equal.
122: *
123: * @param expected Excpected string
124: * @param actual Actual String
125: * @return <code>true</code> is the strings are both null,
126: * or if their contents are the same, otherwise <code>false</code>.
127: */
128: private boolean stringsEqual(String expected, String actual) {
129: boolean equals = true;
130: if ((expected == null) && (actual == null)) {
131: return true;
132: } else if (expected != null && actual != null) {
133: expected = expected.trim();
134: actual = actual.trim();
135: char[] expectedArray = expected.toCharArray();
136: char[] actualArray = actual.toCharArray();
137: int expectedIndex = 0;
138: int actualIndex = 0;
139: while (expectedIndex < expectedArray.length
140: && actualIndex < actualArray.length) {
141: if (expectedArray[expectedIndex] != actualArray[actualIndex]) {
142: equals = false;
143: System.err.println("Lines differ at index"
144: + " expected:" + expectedIndex + "-"
145: + (int) expectedArray[expectedIndex]
146: + " actual:" + actualIndex + "-"
147: + (int) actualArray[actualIndex]);
148: break;
149: }
150: expectedIndex = skipWhitespace(expectedArray,
151: expectedIndex);
152: actualIndex = skipWhitespace(actualArray, actualIndex);
153: expectedIndex++;
154: actualIndex++;
155: }
156: if (equals) {
157: if (expectedIndex != expectedArray.length) {
158: equals = false;
159: System.err.println("Expected line is longer at:"
160: + expectedIndex);
161: }
162: if (actualIndex != actualArray.length) {
163: equals = false;
164: System.err.println("Actual line is longer at:"
165: + actualIndex);
166: }
167: }
168: } else if ((expected == null && actual != null && actual.trim()
169: .equals(""))
170: || (actual == null && expected != null && expected
171: .trim().equals(""))) {
172: //basically there are some cases where pdfbox will put an extra line
173: //at the end of the file, who cares, this is not enough to report
174: // a failure
175: equals = true;
176: } else {
177: equals = false;
178: }
179: return equals;
180: }
181:
182: /**
183: * If the current index is whitespace then skip any subsequent whitespace.
184: */
185: private int skipWhitespace(char[] array, int index) {
186: //if we are at a space character then skip all space
187: //characters, but when all done rollback 1 because stringsEqual
188: //will roll forward 1
189: if (array[index] == ' ' || array[index] > 256) {
190: while (index < array.length
191: && (array[index] == ' ' || array[index] > 256)) {
192: index++;
193: }
194: index--;
195: }
196: return index;
197: }
198:
199: /**
200: * Validate text extraction on a single file.
201: *
202: * @param file The file to validate
203: * @param bLogResult Whether to log the extracted text
204: * @throws Exception when there is an exception
205: */
206: public void doTestFile(File file, boolean bLogResult)
207: throws Exception {
208: System.out.println("Preparing to parse " + file.getName());
209:
210: OutputStream os = null;
211: Writer writer = null;
212: PDDocument document = null;
213: try {
214: document = PDDocument.load(file);
215:
216: File outFile = new File(file.getParentFile()
217: .getParentFile(), "output/" + file.getName()
218: + ".txt");
219: os = new FileOutputStream(outFile);
220: os.write(0xFF);
221: os.write(0xFE);
222: writer = new OutputStreamWriter(os, "UTF-16LE");
223:
224: stripper.writeText(document, writer);
225:
226: if (bLogResult) {
227: System.out.println("Text for " + file.getName()
228: + ":\r\n" + stripper.getText(document));
229: }
230:
231: File expectedFile = new File(file.getParentFile()
232: .getParentFile(), "input/" + file.getName()
233: + ".txt");
234: File actualFile = new File(file.getParentFile()
235: .getParentFile(), "output/" + file.getName()
236: + ".txt");
237:
238: if (!expectedFile.exists()) {
239: this .bFail = true;
240: System.err.println("FAILURE: Input verification file: "
241: + expectedFile.getAbsolutePath()
242: + " did not exist");
243: return;
244: }
245:
246: LineNumberReader expectedReader = new LineNumberReader(
247: new InputStreamReader(new FileInputStream(
248: expectedFile), "UTF-16"));
249: LineNumberReader actualReader = new LineNumberReader(
250: new InputStreamReader(new FileInputStream(
251: actualFile), "UTF-16"));
252:
253: while (true) {
254: String expectedLine = expectedReader.readLine();
255: while (expectedLine != null
256: && expectedLine.trim().length() == 0) {
257: expectedLine = expectedReader.readLine();
258: }
259: String actualLine = actualReader.readLine();
260: while (actualLine != null
261: && actualLine.trim().length() == 0) {
262: actualLine = actualReader.readLine();
263: }
264: if (!stringsEqual(expectedLine, actualLine)) {
265: this .bFail = true;
266: System.err
267: .println("FAILURE: Line mismatch for file "
268: + file.getName()
269: + " at expected line: "
270: + expectedReader.getLineNumber()
271: + " at actual line: "
272: + actualReader.getLineNumber()
273: + "\r\n expected line was: \""
274: + expectedLine + "\""
275: + "\r\n actual line was: \""
276: + actualLine + "\"");
277: //lets report all lines, even though this might produce some verbose logging
278: //break;
279: }
280:
281: if (expectedLine == null || actualLine == null) {
282: break;
283: }
284: }
285: } finally {
286: if (writer != null) {
287: writer.close();
288: }
289: if (os != null) {
290: os.close();
291: }
292: if (document != null) {
293: document.close();
294: }
295: }
296: }
297:
298: /**
299: * Test to validate text extraction of file set.
300: *
301: * @throws Exception when there is an exception
302: */
303: public void testExtract() throws Exception {
304: String filename = System
305: .getProperty("test.pdfbox.util.TextStripper.file");
306: File testDir = new File("test/input");
307:
308: if ((filename == null) || (filename.length() == 0)) {
309: File[] testFiles = testDir.listFiles(new FilenameFilter() {
310: public boolean accept(File dir, String name) {
311: return (name.endsWith(".pdf"));
312: }
313: });
314:
315: for (int n = 0; n < testFiles.length; n++) {
316: doTestFile(testFiles[n], false);
317: }
318: } else {
319: doTestFile(new File(testDir, filename), true);
320: }
321:
322: if (this .bFail) {
323: fail("One or more failures, see test log for details");
324: }
325: }
326:
327: /**
328: * Set the tests in the suite for this test class.
329: *
330: * @return the Suite.
331: */
332: public static Test suite() {
333: return new TestSuite(TestTextStripper.class);
334: }
335:
336: /**
337: * Command line execution.
338: *
339: * @param args Command line arguments.
340: */
341: public static void main(String[] args) {
342: String[] arg = { TestTextStripper.class.getName() };
343: junit.textui.TestRunner.main(arg);
344: }
345: }
|