Source Code Cross Referenced for PdfBoxPDFPage.java in » Testing » webtest » com » canoo » webtest » plugins » pdftest » htmlunit » pdfbox » Java Source Code / Java Documentation - Java Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Testing » webtest » com.canoo.webtest.plugins.pdftest.htmlunit.pdfbox 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        package com.canoo.webtest.plugins.pdftest.htmlunit.pdfbox;
002:
003:        import java.awt.geom.Rectangle2D;
004:        import java.io.IOException;
005:        import java.io.StringWriter;
006:        import java.util.ArrayList;
007:        import java.util.HashMap;
008:        import java.util.Iterator;
009:        import java.util.List;
010:        import java.util.ListIterator;
011:        import java.util.Map;
012:        import java.util.TreeMap;
013:
014:        import org.apache.commons.collections.Predicate;
015:        import org.apache.commons.collections.functors.AndPredicate;
016:        import org.apache.commons.collections.functors.TruePredicate;
017:        import org.apache.commons.io.IOUtils;
018:        import org.apache.commons.logging.Log;
019:        import org.apache.commons.logging.LogFactory;
020:        import org.pdfbox.cos.COSBase;
021:        import org.pdfbox.cos.COSBoolean;
022:        import org.pdfbox.cos.COSDictionary;
023:        import org.pdfbox.cos.COSFloat;
024:        import org.pdfbox.cos.COSInteger;
025:        import org.pdfbox.cos.COSName;
026:        import org.pdfbox.cos.COSNull;
027:        import org.pdfbox.cos.COSString;
028:        import org.pdfbox.exceptions.InvalidPasswordException;
029:        import org.pdfbox.pdmodel.PDDocument;
030:        import org.pdfbox.pdmodel.PDPage;
031:        import org.pdfbox.pdmodel.common.PDRectangle;
032:        import org.pdfbox.pdmodel.encryption.PDStandardEncryption;
033:        import org.pdfbox.pdmodel.font.PDFont;
034:        import org.pdfbox.pdmodel.interactive.action.type.PDAction;
035:        import org.pdfbox.pdmodel.interactive.action.type.PDActionGoTo;
036:        import org.pdfbox.pdmodel.interactive.action.type.PDActionURI;
037:        import org.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
038:        import org.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
039:        import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
040:        import org.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
041:        import org.pdfbox.pdmodel.interactive.form.PDAcroForm;
042:        import org.pdfbox.pdmodel.interactive.form.PDField;
043:        import org.pdfbox.util.PDFTextStripper;
044:        import org.pdfbox.util.PDFTextStripperByArea;
045:        import org.pdfbox.util.TextPosition;
046:
047:        import com.canoo.webtest.plugins.pdftest.htmlunit.PDFEncryptionPermission;
048:        import com.canoo.webtest.plugins.pdftest.htmlunit.PDFField;
049:        import com.canoo.webtest.plugins.pdftest.htmlunit.PDFInvalidPasswordException;
050:        import com.canoo.webtest.plugins.pdftest.htmlunit.PDFPage;
051:        import com.gargoylesoftware.htmlunit.WebResponse;
052:        import com.gargoylesoftware.htmlunit.WebWindow;
053:
054:        /**
055:         * Implementation of {@link PDFPage} based on <a href="http://www.pdfbox.org/">PDFBox</a>.
056:         * @author Etienne Studer
057:         * @author Paul King
058:         * @author Marc Guillemot
059:         */
060:        public class PdfBoxPDFPage implements  PDFPage {
                /** Parsed PDF; assigned by the constructor and re-assigned by getPDFDocument() after cleanUp(). Null when parsing failed. */
061:            private PDDocument pdfDocument_;
                /** Bookmark list, filled by getBookmarks() on first use and then reused. */
062:            private List bookmarks_;
                /** Window passed to the constructor; returned by getEnclosingWindow(). */
063:            private final WebWindow webWindow_;
                /** Response passed to the constructor; its content stream is what loadPDFDocument() parses. */
064:            private final WebResponse webResponse_;
065:
                /** Name of the "Title" entry that getDocumentTitle() reads from the document information dictionary. */
066:            private static final COSName INFO_PROPERTY_TITLE = COSName
067:                    .getPDFName("Title"); // title of document
068:
                /** Set by cleanUp(); makes getPDFDocument() reload the document on its next call. */
069:            private boolean cleanUpCalled;
                /** Incremented once per constructed instance (plain int, no synchronization). */
070:            private static int counter = 0;
                /** Incremented per constructed instance and decremented on every cleanUp() call (plain int, no synchronization). */
071:            private static int allocated = 0;
072:
073:            public void cleanUp() throws IOException {
074:                cleanUpCalled = true;
075:                allocated--;
076:                if (pdfDocument_ != null)
077:                    pdfDocument_.close();
078:            }
079:
080:            public PdfBoxPDFPage(final WebResponse webResponse,
081:                    final WebWindow webWindow) {
082:                webWindow_ = webWindow;
083:                webResponse_ = webResponse;
084:
085:                pdfDocument_ = loadPDFDocument();
086:                counter++;
087:                allocated++;
088:            }
089:
090:            protected PDDocument loadPDFDocument() {
091:                try {
092:                    return PDDocument.load(getWebResponse()
093:                            .getContentAsStream());
094:                } catch (final IOException e) {
095:                    getLog().warn(
096:                            "Failed parsing PDF document "
097:                                    + getWebResponse().getUrl() + ": "
098:                                    + e.getMessage(), e);
099:                }
100:
101:                return null;
102:            }
103:
104:            /**
105:             * Return the log object for this web client
106:             * @return The log object
107:             */
108:            protected final Log getLog() {
109:                return LogFactory.getLog(getClass());
110:            }
111:
112:            private COSDictionary getInfoDictionary() {
113:                final COSDictionary encryptProperties = getPDFDocument()
114:                        .getDocumentInformation().getDictionary();
115:                return encryptProperties != null ? encryptProperties
116:                        : new COSDictionary();
117:            }
118:
119:            private static void assertKeyExists(COSName key,
120:                    COSDictionary properties) {
121:                if (properties.keyList().contains(key)) {
122:                    return;
123:                }
124:
125:                throw new IllegalArgumentException("Specified property key '"
126:                        + key.getName() + "' does not exist.");
127:            }
128:
129:            public String getDocumentTitle() {
130:                assertKeyExists(INFO_PROPERTY_TITLE, getInfoDictionary());
131:                COSString title = (COSString) getInfoDictionary().getItem(
132:                        INFO_PROPERTY_TITLE);
133:                return title.getString();
134:            }
135:
136:            public WebWindow getEnclosingWindow() {
137:                return webWindow_;
138:            }
139:
140:            public WebResponse getWebResponse() {
141:                return webResponse_;
142:            }
143:
144:            public void initialize() throws IOException {
145:                // TODO Auto-generated method stub
146:
147:            }
148:
149:            public int getNumberOfPages() {
150:                return getPDFDocument().getNumberOfPages();
151:            }
152:
153:            /**
154:             * Gets the PDF document
155:             * @return the document
156:             * @throws RuntimeException if the PDF document couldn't be parsed
157:             */
158:            protected PDDocument getPDFDocument() {
159:                if (cleanUpCalled) {
160:                    pdfDocument_ = loadPDFDocument();
161:                    cleanUpCalled = false;
162:                }
163:                if (pdfDocument_ == null)
164:                    throw new RuntimeException(
165:                            "Can't work on pdf document as it couldn't get parsed");
166:                return pdfDocument_;
167:            }
168:
169:            public List getFields() {
170:                return getFields(TruePredicate.INSTANCE);
171:            }
172:
173:            public void decrypt(String password) {
174:                try {
175:                    getPDFDocument().decrypt(password);
176:                } catch (final InvalidPasswordException e) {
177:                    throw new PDFInvalidPasswordException(e);
178:                } catch (final Exception e) {
179:                    throw new RuntimeException(
180:                            "Problem decrypting the document", e);
181:                }
182:            }
183:
184:            public boolean isEncrypted() {
185:                return getPDFDocument().isEncrypted();
186:            }
187:
188:            public String getText(int startPage, int endPage) {
189:                return getTextInternal(startPage, endPage);
190:            }
191:
192:            protected String getTextInternal(int startPage, int endPage) {
193:                try {
194:                    final PDFTextStripper textStripper = new PDFTextStripper();
195:                    textStripper.setStartPage(startPage);
196:                    textStripper.setEndPage(endPage);
197:                    return textStripper.getText(getPDFDocument());
198:                } catch (final IOException e) {
199:                    throw new RuntimeException("Problem extracting text", e);
200:                }
201:            }
202:
203:            protected List getFields(final Predicate filter) {
204:                final PDAcroForm acroForm = getPDFDocument()
205:                        .getDocumentCatalog().getAcroForm();
206:                final List response = new ArrayList();
207:
208:                try {
209:                    if (acroForm != null) {
210:                        final List fields = acroForm.getFields();
211:                        for (final Iterator iter = fields.iterator(); iter
212:                                .hasNext();) {
213:                            final PDField field = (PDField) iter.next();
214:                            final List kids = field.getKids();
215:                            if (kids != null && !kids.isEmpty()) {
216:                                for (final Iterator iterKids = kids.iterator(); iterKids
217:                                        .hasNext();) {
218:                                    final PDField childField = (PDField) iterKids
219:                                            .next();
220:                                    if (filter.evaluate(childField)) {
221:                                        response.add(PdfBoxPDFField
222:                                                .wrap(childField));
223:                                    }
224:                                }
225:                            } else if (filter.evaluate(field)) {
226:                                response.add(PdfBoxPDFField.wrap(field));
227:                            }
228:                        }
229:                    }
230:                } catch (final IOException e) {
231:                    throw new RuntimeException("Failed reading fields", e);
232:                }
233:
234:                return response;
235:            }
236:
237:            public List getFields(final String name, final PDFField.Type type) {
238:                return getFields(PdfBoxPDFField.FieldPredicate
239:                        .buildNamePredicate(name));
240:            }
241:
242:            public List getFields(final String name, final int pageNumber,
243:                    final PDFField.Type type) {
244:                final Predicate predicateName = PdfBoxPDFField.FieldPredicate
245:                        .buildNamePredicate(name);
246:                final Predicate predicatePage = PdfBoxPDFField.FieldPredicate
247:                        .buildPageNumberPredicate(pageNumber);
248:
249:                final Predicate predicate = new AndPredicate(predicateName,
250:                        predicatePage);
251:
252:                return getFields(predicate);
253:            }
254:
255:            public boolean hasPermission(
256:                    final PDFEncryptionPermission permission) {
257:                final PDStandardEncryption info;
258:                try {
259:                    info = (PDStandardEncryption) getPDFDocument()
260:                            .getEncryptionDictionary();
261:                } catch (final IOException e) {
262:                    throw new RuntimeException("Can't read permissions", e);
263:                }
264:
265:                if (PDFEncryptionPermission.ASSEMBLY.equals(permission))
266:                    return info.canAssembleDocument();
267:                else if (PDFEncryptionPermission.COPY.equals(permission))
268:                    return info.canExtractContent();
269:                else if (PDFEncryptionPermission.DEGRADED_PRINTING
270:                        .equals(permission))
271:                    return info.canPrintDegraded();
272:                else if (PDFEncryptionPermission.FILL_IN.equals(permission))
273:                    return info.canFillInForm();
274:                else if (PDFEncryptionPermission.MODIFY_ANNOTATIONS
275:                        .equals(permission))
276:                    return info.canModifyAnnotations();
277:                else if (PDFEncryptionPermission.MODIFY_CONTENTS
278:                        .equals(permission))
279:                    return info.canModify();
280:                else if (PDFEncryptionPermission.PRINTING.equals(permission))
281:                    return info.canPrint();
282:                else if (PDFEncryptionPermission.SCREEN_READERS
283:                        .equals(permission))
284:                    return info.canExtractForAccessibility();
285:
286:                throw new IllegalArgumentException("Unknown pdf permission: "
287:                        + permission);
288:            }
289:
290:            public String getEncryptProperty(final String key) {
291:                final COSDictionary encryptProperties = getPDFDocument()
292:                        .getDocument().getEncryptionDictionary();
293:                return stringValue(encryptProperties.getDictionaryObject(key));
294:            }
295:
296:            static String stringValue(final COSBase element) {
297:                if (element == null) {
298:                    return null;
299:                } else if (element instanceof  COSString) {
300:                    return ((COSString) element).getString();
301:                } else if (element instanceof  COSName) {
302:                    return ((COSName) element).getName();
303:                } else if (element instanceof  COSBoolean) {
304:                    return String.valueOf(((COSBoolean) element).getValue());
305:                } else if (element instanceof  COSInteger) {
306:                    return String.valueOf(((COSInteger) element).intValue());
307:                } else if (element instanceof  COSFloat) {
308:                    return String.valueOf(((COSFloat) element).floatValue());
309:                } else if (element instanceof  COSNull) {
310:                    return "null";
311:                } else
312:                    return String.valueOf(element);
313:            }
314:
315:            public int getEncryptionStrength() {
316:                try {
317:                    return getPDFDocument().getEncryptionDictionary()
318:                            .getLength();
319:                } catch (final IOException e) {
320:                    throw new RuntimeException(
321:                            "Failed reading encryption strength", e);
322:                }
323:            }
324:
325:            public String getInfoProperty(final String key) {
326:                final COSDictionary properties = getPDFDocument()
327:                        .getDocumentInformation().getDictionary();
328:                if (properties == null)
329:                    return null;
330:
331:                final COSName pdfName = COSName.getPDFName(key);
332:                return stringValue(properties.getDictionaryObject(pdfName));
333:            }
334:
335:            public boolean isUserPassword(final String password) {
336:                try {
337:                    return getPDFDocument().isUserPassword(password);
338:                } catch (final Exception e) {
339:                    throw new RuntimeException(
340:                            "Failed verifying user password", e);
341:                }
342:            }
343:
344:            public boolean isOwnerPassword(final String password) {
345:                try {
346:                    return getPDFDocument().isOwnerPassword(password);
347:                } catch (final Exception e) {
348:                    throw new RuntimeException(
349:                            "Failed verifying owner password", e);
350:                }
351:            }
352:
353:            public List getBookmarks() {
354:                if (bookmarks_ == null)
355:                    bookmarks_ = extractBookmarks();
356:
357:                return bookmarks_;
358:            }
359:
360:            private List extractBookmarks() {
361:                final PDDocumentOutline outline = getPDFDocument()
362:                        .getDocumentCatalog().getDocumentOutline();
363:                final List result = new ArrayList();
364:                if (outline != null) {
365:                    PDOutlineItem child = outline.getFirstChild();
366:                    while (child != null) {
367:                        final PdfBoxPDFBookmark topBookmark = new PdfBoxPDFBookmark(
368:                                child, null);
369:                        result.add(topBookmark);
370:                        result.addAll(topBookmark.getAllChildren());
371:                        child = child.getNextSibling();
372:                    }
373:                }
374:                return result;
375:            }
376:
377:            public List getFonts() {
378:                final List fonts = new ArrayList();
379:                final List pages = getPDFDocument().getDocumentCatalog()
380:                        .getAllPages();
381:                for (final ListIterator iter = pages.listIterator(); iter
382:                        .hasNext();) {
383:                    final PDPage page = (PDPage) iter.next();
384:                    try {
385:                        for (final Iterator fontIterator = page.findResources()
386:                                .getFonts().values().iterator(); fontIterator
387:                                .hasNext();) {
388:                            final PDFont font = (PDFont) fontIterator.next();
389:                            fonts
390:                                    .add(new PDFBoxPDFFont(font, iter
391:                                            .nextIndex())); // nextIndex() because page number start with 1 not 0
392:                        }
393:                    } catch (final IOException e) {
394:                        throw new RuntimeException(
395:                                "Failed retrieving the fonts on page "
396:                                        + iter.nextIndex(), e);
397:                    }
398:                }
399:                return fonts;
400:            }
401:
402:            public List getFields(int pageNumber) {
403:                final Predicate predicatePage = PdfBoxPDFField.FieldPredicate
404:                        .buildPageNumberPredicate(pageNumber);
405:                return getFields(predicatePage);
406:            }
407:
408:            public List getFields(final String name, final int pageNumber) {
409:                final Predicate predicateName = PdfBoxPDFField.FieldPredicate
410:                        .buildNamePredicate(name);
411:                final Predicate predicatePage = PdfBoxPDFField.FieldPredicate
412:                        .buildPageNumberPredicate(pageNumber);
413:
414:                final Predicate predicate = new AndPredicate(predicateName,
415:                        predicatePage);
416:
417:                return getFields(predicate);
418:            }
419:
420:            public List getFields(final String name) {
421:                return getFields(PdfBoxPDFField.FieldPredicate
422:                        .buildNamePredicate(name));
423:            }
424:
425:            /**
426:             * Gets the links from the document
427:             * @return the links
428:             */
429:            public List getLinks() {
430:                final List result = new ArrayList();
431:                final List allPages = getPDFDocument().getDocumentCatalog()
432:                        .getAllPages();
433:                for (final ListIterator iter = allPages.listIterator(); iter
434:                        .hasNext();) {
435:                    final PDPage page = (PDPage) iter.next();
436:                    processPage(result, page, iter.nextIndex());
437:                }
438:                return result;
439:            }
440:
441:            private static void processPage(final List result,
442:                    final PDPage page, final int pageNum) {
443:                try {
444:                    final PDFTextStripperByArea stripper = new PDFTextStripperByArea();
445:                    final List linkAnnotations = new ArrayList();
446:                    final List linkRegions = new ArrayList();
447:                    extractAnnotations(page, stripper, linkAnnotations,
448:                            linkRegions);
449:                    stripper.extractRegions(page);
450:                    final Map uriMap = new HashMap();
451:                    final Map textMap = new HashMap();
452:                    collateLinks(linkAnnotations, linkRegions, uriMap, textMap,
453:                            stripper);
454:                    final Iterator it = uriMap.keySet().iterator();
455:                    while (it.hasNext()) {
456:                        final Object key = it.next();
457:                        result.add(new PDFBoxPDFLink((String) textMap.get(key),
458:                                (String) uriMap.get(key), pageNum));
459:                    }
460:                } catch (final IOException e) {
461:                    // ignore
462:                }
463:            }
464:
465:            private static void collateLinks(final List linkAnnotations,
466:                    final List linkRegions, final Map uriMap,
467:                    final Map textMap, final PDFTextStripperByArea stripper)
468:                    throws IOException {
469:                for (int j = 0; j < linkAnnotations.size(); j++) {
470:                    final PDAnnotationLink link = (PDAnnotationLink) linkAnnotations
471:                            .get(j);
472:                    final PDAction action = link.getAction();
473:                    final String urlText = stripper.getTextForRegion(Integer
474:                            .toString(j));
475:                    if (action instanceof  PDActionURI) {
476:                        final PDActionURI uri = (PDActionURI) action;
477:                        // internal links have no text
478:                        if (urlText.length() > 0) {
479:                            textMap.put(linkRegions.get(j), urlText);
480:                        }
481:                        uriMap.put(linkRegions.get(j), uri.getURI());
482:                    } else if (action instanceof  PDActionGoTo) {
483:                        // internal link text associated with goto
484:                        if (urlText.length() > 0) {
485:                            textMap.put(linkRegions.get(j), urlText);
486:                        }
487:                    }
488:                }
489:            }
490:
491:            private static List extractAnnotations(final PDPage page,
492:                    final PDFTextStripperByArea stripper,
493:                    final List linkAnnotations, final List linkRegions)
494:                    throws IOException {
495:                final List annotations = page.getAnnotations();
496:                for (int j = 0; j < annotations.size(); j++) {
497:                    final PDAnnotation annot = (PDAnnotation) annotations
498:                            .get(j);
499:                    if (annot instanceof  PDAnnotationLink) {
500:                        final PDRectangle rect = annot.getRectangle();
501:                        //need to reposition link rectangle to match text space plus add
502:                        //a little to account for descenders and the like
503:                        final float x = rect.getLowerLeftX() - 1;
504:                        float y = rect.getUpperRightY() - 1;
505:                        final float width = rect.getWidth() + 2;
506:                        final float height = rect.getHeight()
507:                                + rect.getHeight() / 4;
508:                        final int rotation = page.findRotation();
509:                        if (rotation == 0) {
510:                            final PDRectangle pageSize = page.findMediaBox();
511:                            y = pageSize.getHeight() - y;
512:                        }
513:
514:                        final Rectangle2D.Float awtRect = new Rectangle2D.Float(
515:                                x, y, width, height);
516:                        stripper.addRegion(Integer.toString(j), awtRect);
517:                        linkAnnotations.add(annot);
518:                        linkRegions.add(awtRect);
519:                    }
520:                }
521:                return annotations;
522:            }
523:
524:            public String getText(final String fragmentSeparator,
525:                    final String lineSeparator, final String pageSeparator,
526:                    final String mode) {
527:                return getText(0, getNumberOfPages(), fragmentSeparator,
528:                        lineSeparator, pageSeparator, mode);
529:            }
530:
531:            private String getText(final int startPage, final int endPage,
532:                    final String fragmentSeparator, final String lineSeparator,
533:                    final String pageSeparator, final String mode) {
534:                final StringBuffer buf = new StringBuffer();
535:                if (MODE_NORMAL.equals(mode)) {
536:                    buf.append(getTextInternal(startPage, endPage,
537:                            lineSeparator, pageSeparator));
538:                } else {
539:                    for (int page = startPage; page <= endPage; page++) {
540:                        final List fragments = getFragments(page,
541:                                fragmentSeparator, lineSeparator);
542:                        final String tmp = collateFragments(fragments,
543:                                fragmentSeparator, lineSeparator);
544:                        if (tmp.length() > 0) {
545:                            buf.append(tmp);
546:                            buf.append(pageSeparator);
547:                        }
548:                    }
549:                }
550:                return buf.toString();
551:            }
552:
553:            private String getTextInternal(final int startPage,
554:                    final int endPage, final String lineSeparator,
555:                    final String pageSeparator) {
556:                final StringWriter output = new StringWriter();
557:                try {
558:                    final PDFTextStripper textStripper = new PDFTextStripper();
559:                    textStripper.setPageSeparator(pageSeparator);
560:                    textStripper.setLineSeparator(lineSeparator);
561:                    textStripper.setStartPage(startPage);
562:                    textStripper.setEndPage(endPage);
563:                    textStripper.writeText(getPDFDocument(), output);
564:                    return output.toString();
565:                } catch (final Exception e) {
566:                    throw new RuntimeException(
567:                            "Error while extracting text from document.", e);
568:                } finally {
569:                    IOUtils.closeQuietly(output);
570:                }
571:            }
572:
573:            public List getFragments(int page, final String fragmentSeparator,
574:                    final String lineSeparator) {
575:                final List fragments = new ArrayList();
576:
577:                final StringWriter output = new StringWriter();
578:                try {
579:                    final PDFTextStripper textStripper = new PDFTextStripper() {
580:                        protected void showCharacter(TextPosition textPosition) {
581:                            fragments.add(textPosition);
582:                        }
583:                    };
584:                    textStripper.setLineSeparator(lineSeparator);
585:                    textStripper.setStartPage(page);
586:                    textStripper.setEndPage(page);
587:                    textStripper.writeText(getPDFDocument(), output);
588:                    return fragments;
589:                } catch (final Exception e) {
590:                    throw new RuntimeException(
591:                            "Error while extracting text from document.", e);
592:                } finally {
593:                    IOUtils.closeQuietly(output);
594:                }
595:            }
596:
597:            private String collateFragments(List fragments,
598:                    String fragmentSeparator, String lineSeparator) {
599:                final Map linesOfText = new TreeMap();
600:                regroup(fragments, linesOfText);
601:                final Map linesOfString = new TreeMap();
602:                coalesce(linesOfText, linesOfString);
603:                return fragmentsToString(linesOfString, fragmentSeparator,
604:                        lineSeparator);
605:            }
606:
607:            private void coalesce(Map linesOfText, Map linesOfString) {
608:                Iterator kit = linesOfText.keySet().iterator();
609:                while (kit.hasNext()) {
610:                    Integer key = (Integer) kit.next();
611:                    linesOfString.put(key, coalesceLine((Map) linesOfText
612:                            .get(key)));
613:                }
614:            }
615:
616:            private Map coalesceLine(Map input) {
617:                final Map output = new TreeMap();
618:                final Iterator kit = input.keySet().iterator();
619:                TextPosition lastFragment = null;
620:                String lastString = null;
621:                Integer lastKey = null;
622:                while (kit.hasNext()) {
623:                    final Integer key = (Integer) kit.next();
624:                    final TextPosition this Fragment = (TextPosition) input
625:                            .get(key);
626:                    if (lastFragment != null
627:                            && adjacent(lastFragment, this Fragment)) {
628:                        lastFragment = this Fragment;
629:                        lastString += this Fragment.getCharacter();
630:                    } else {
631:                        if (lastFragment != null) {
632:                            output.put(lastKey, lastString);
633:                        }
634:                        lastFragment = this Fragment;
635:                        lastString = this Fragment.getCharacter();
636:                        lastKey = key;
637:                    }
638:                    if (lastFragment != null) {
639:                        output.put(lastKey, lastString);
640:                    }
641:                }
642:                return output;
643:            }
644:
645:            private boolean adjacent(final TextPosition lastFragment,
646:                    final TextPosition this Fragment) {
647:                final int TOLERANCE = 2;
648:                return this Fragment.getX()
649:                        - (lastFragment.getX() + lastFragment.getWidth()
650:                                * lastFragment.getXScale()) < TOLERANCE;
651:            }
652:
653:            private void regroup(final List fragments, final Map lines) {
654:                for (int i = 0; i < fragments.size(); i++) {
655:                    final TextPosition textPosition = (TextPosition) fragments
656:                            .get(i);
657:                    final Integer y = new Integer((int) textPosition.getY());
658:                    final Integer x = new Integer((int) textPosition.getX());
659:                    final Map pieces;
660:                    if (lines.containsKey(y)) {
661:                        pieces = (TreeMap) lines.get(y);
662:                    } else {
663:                        pieces = new TreeMap();
664:                    }
665:                    pieces.put(x, textPosition);
666:                    lines.put(y, pieces);
667:                }
668:            }
669:
670:            private String fragmentsToString(Map linesOfString,
671:                    String fragmentSeparator, String lineSeparator) {
672:                StringBuffer buf = new StringBuffer();
673:                Iterator lit = linesOfString.values().iterator();
674:                while (lit.hasNext()) {
675:                    Map pieces = (Map) lit.next();
676:                    Iterator pit = pieces.values().iterator();
677:                    while (pit.hasNext()) {
678:                        String piece = (String) pit.next();
679:                        buf.append(piece);
680:                        if (pit.hasNext()) {
681:                            buf.append(fragmentSeparator);
682:                        }
683:                    }
684:                    buf.append(lineSeparator);
685:                }
686:                return buf.toString();
687:            }
688:
689:        }
www.java2s.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.