Source Code Cross Referenced for LinkStatusGenerator.java in  » Content-Management-System » apache-lenya-2.0 » org » apache » cocoon » generation » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Content Management System » apache lenya 2.0 » org.apache.cocoon.generation 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


001:        /*
002:         * Licensed to the Apache Software Foundation (ASF) under one or more
003:         * contributor license agreements.  See the NOTICE file distributed with
004:         * this work for additional information regarding copyright ownership.
005:         * The ASF licenses this file to You under the Apache License, Version 2.0
006:         * (the "License"); you may not use this file except in compliance with
007:         * the License.  You may obtain a copy of the License at
008:         *
009:         *      http://www.apache.org/licenses/LICENSE-2.0
010:         *
011:         * Unless required by applicable law or agreed to in writing, software
012:         * distributed under the License is distributed on an "AS IS" BASIS,
013:         * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014:         * See the License for the specific language governing permissions and
015:         * limitations under the License.
016:         */
017:        package org.apache.cocoon.generation;
018:
019:        import org.apache.avalon.framework.parameters.Parameters;
020:        import org.apache.avalon.framework.configuration.Configurable;
021:        import org.apache.avalon.framework.configuration.Configuration;
022:        import org.apache.avalon.framework.configuration.ConfigurationException;
023:        import org.apache.cocoon.ProcessingException;
024:        import org.apache.cocoon.ResourceNotFoundException;
025:        import org.apache.cocoon.environment.SourceResolver;
026:        import org.apache.cocoon.Constants;
027:        import org.apache.commons.lang.StringUtils;
028:        import org.apache.regexp.RE;
029:        import org.apache.regexp.RESyntaxException;
030:
031:        import org.xml.sax.SAXException;
032:        import org.xml.sax.helpers.AttributesImpl;
033:
034:        import java.io.IOException;
035:        import java.io.InputStream;
036:        import java.io.BufferedReader;
037:        import java.io.InputStreamReader;
038:        import java.net.URLConnection;
039:        import java.net.HttpURLConnection;
040:        import java.net.URL;
041:        import java.util.Map;
042:        import java.util.HashSet;
043:        import java.util.Iterator;
044:        import java.util.List;
045:        import java.util.ArrayList;
046:
047:        /**
048:         * @cocoon.sitemap.component.documentation
049:         * Generates a list of links that are reachable from the src and their status.
050:         *
051:         * @cocoon.sitemap.component.name   linkstatus
052:         * @cocoon.sitemap.component.label  content
053:         * @cocoon.sitemap.component.logger sitemap.generator.linkstatus
054:         *
055:         * @author Michael Homeijer
056:         * @author Nicola Ken Barozzi (nicolaken@apache.org)
057:         * @author Bernhard Huber (huber@apache.org)
058:         * @version $Id: LinkStatusGenerator.java 433543 2006-08-22 06:22:54Z crossley $
059:         */
060:        public class LinkStatusGenerator extends ServiceableGenerator implements 
061:                Configurable {
062:
063:            /** The URI of the namespace of this generator. */
064:            protected static final String URI = "http://apache.org/cocoon/linkstatus/2.0";
065:
066:            /** The namespace prefix for this namespace. */
067:            protected static final String PREFIX = "linkstatus";
068:
069:            /* Node and attribute names */
070:            protected static final String TOP_NODE_NAME = "linkstatus";
071:            protected static final String LINK_NODE_NAME = "link";
072:
073:            protected static final String HREF_ATTR_NAME = "href";
074:            protected static final String REFERRER_ATTR_NAME = "referrer";
075:            protected static final String CONTENT_ATTR_NAME = "content";
076:            protected static final String STATUS_ATTR_NAME = "status";
077:            protected static final String MESSAGE_ATTR_NAME = "message";
078:
079:            protected AttributesImpl attributes;
080:
081:            /**
082:             * Config element name specifying expected link content-type.
083:             * <p>
084:             *   Its value is <code>link-content-type</code>.
085:             * </p>
086:             *
087:             * @since
088:             */
089:            public final static String LINK_CONTENT_TYPE_CONFIG = "link-content-type";
090:
091:            /**
092:             * Default value of <code>link-content-type</code> configuration value.
093:             * <p>
094:             *   Its value is <code>application/x-cocoon-links</code>.
095:             * </p>
096:             *
097:             * @since
098:             */
099:            public final String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";
100:
101:            /**
102:             * Config element name specifying query-string appended for requesting links
103:             * of an URL.
104:             * <p>
105:             *  Its value is <code>link-view-query</code>.
106:             * </p>
107:             *
108:             * @since
109:             */
110:            public final static String LINK_VIEW_QUERY_CONFIG = "link-view-query";
111:            /**
112:             * Default value of <code>link-view-query</code> configuration value.
113:             * <p>
114:             *   Its value is <code>cocoon-view=links</code>.
115:             * </p>
116:             *
117:             * @since
118:             */
119:            public final static String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
120:
121:            /**
122:             * Config element name specifying excluding regular expression pattern.
123:             * <p>
124:             *  Its value is <code>exclude</code>.
125:             * </p>
126:             *
127:             * @since
128:             */
129:            public final static String EXCLUDE_CONFIG = "exclude";
130:
131:            /**
132:             * Config element name specifying including regular expression pattern.
133:             * <p>
134:             *  Its value is <code>include</code>.
135:             * </p>
136:             *
137:             * @since
138:             */
139:            public final static String INCLUDE_CONFIG = "include";
140:
141:            /**
142:             * Config element name specifying http header value for user-Agent.
143:             * <p>
144:             *  Its value is <code>user-agent</code>.
145:             * </p>
146:             *
147:             * @since
148:             */
149:            public final static String USER_AGENT_CONFIG = "user-agent";
150:            /**
151:             * Default value of <code>user-agent</code> configuration value.
152:             *
153:             * @see org.apache.cocoon.Constants#COMPLETE_NAME
154:             * @since
155:             */
156:            public final static String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
157:
158:            /**
159:             * Config element name specifying http header value for accept.
160:             * <p>
161:             *  Its value is <code>accept</code>.
162:             * </p>
163:             *
164:             * @since
165:             */
166:            public final static String ACCEPT_CONFIG = "accept";
167:            /**
168:             * Default value of <code>accept</code> configuration value.
169:             * <p>
170:             *   Its value is <code>* / *</code>
171:             * </p>
172:             *
173:             * @since
174:             */
175:            public final static String ACCEPT_DEFAULT = "*/*";
176:
177:            private String linkViewQuery = LINK_VIEW_QUERY_DEFAULT;
178:            private String linkContentType = LINK_CONTENT_TYPE_DEFAULT;
179:            private HashSet excludeCrawlingURL;
180:            private HashSet includeCrawlingURL;
181:            // FIXME - The following two are never read, can we delete them?
182:            //private String userAgent = USER_AGENT_DEFAULT;
183:            //private String accept = ACCEPT_DEFAULT;
184:
185:            private HashSet crawled;
186:            private HashSet linksToProcess;
187:
188:            /**
189:             * Stores links to process and the referrer links
190:             */
191:            private static class Link {
192:                private URL url;
193:                private String referrer;
194:
195:                public Link(URL url, String referrer) {
196:                    this .url = url;
197:                    this .referrer = referrer;
198:                }
199:
200:                public URL getURL() {
201:                    return url;
202:                }
203:
204:                public String getReferrer() {
205:                    return referrer;
206:                }
207:
208:                public boolean equals(Link l) {
209:                    return url.equals(l.getURL());
210:                }
211:            }
212:
213:            /**
214:             * Configure the crawler component.
215:             * <p>
216:             *  Configure can specify which URI to include, and which URI to exclude
217:             *  from crawling. You specify the patterns as regular expressions.
218:             * </p>
219:             * <p>
220:             *  Morover you can configure
221:             *  the required content-type of crawling request, and the
222:             *  query-string appended to each crawling request.
223:             * </p>
224:             * <pre><tt>
225:             * &lt;include&gt;.*\.html?&lt;/include&gt; or &lt;include&gt;.*\.html?, .*\.xsp&lt;/include&gt;
226:             * &lt;exclude&gt;.*\.gif&lt;/exclude&gt; or &lt;exclude&gt;.*\.gif, .*\.jpe?g&lt;/exclude&gt;
227:             * &lt;link-content-type&gt; application/x-cocoon-links &lt;/link-content-type&gt;
228:             * &lt;link-view-query&gt; ?cocoon-view=links &lt;/link-view-query&gt;
229:             * &lt;user-agent&gt; Cocoon &lt;/user-agent&gt;
230:             * &lt;accept&gt; text/xml &lt;/accept&gt;
231:             * </tt></pre>
232:             *
233:             * @param  configuration               XML configuration of this avalon component.
234:             * @exception  ConfigurationException  is throwing if configuration is invalid.
235:             * @since
236:             */
237:            public void configure(Configuration configuration)
238:                    throws ConfigurationException {
239:
240:                Configuration[] children;
241:                children = configuration.getChildren(INCLUDE_CONFIG);
242:                if (children.length > 0) {
243:                    includeCrawlingURL = new HashSet();
244:                    for (int i = 0; i < children.length; i++) {
245:                        String pattern = children[i].getValue();
246:                        try {
247:                            String params[] = StringUtils.split(pattern, ", ");
248:                            for (int index = 0; index < params.length; index++) {
249:                                String tokenized_pattern = params[index];
250:                                this .includeCrawlingURL.add(new RE(
251:                                        tokenized_pattern));
252:                            }
253:                        } catch (RESyntaxException rese) {
254:                            getLogger().error(
255:                                    "Cannot create including regular-expression for "
256:                                            + pattern, rese);
257:                        }
258:                    }
259:                }
260:
261:                children = configuration.getChildren(EXCLUDE_CONFIG);
262:                if (children.length > 0) {
263:                    excludeCrawlingURL = new HashSet();
264:                    for (int i = 0; i < children.length; i++) {
265:                        String pattern = children[i].getValue();
266:                        try {
267:                            String params[] = StringUtils.split(pattern, ", ");
268:                            for (int index = 0; index < params.length; index++) {
269:                                String tokenized_pattern = params[index];
270:                                this .excludeCrawlingURL.add(new RE(
271:                                        tokenized_pattern));
272:                            }
273:                        } catch (RESyntaxException rese) {
274:                            getLogger().error(
275:                                    "Cannot create excluding regular-expression for "
276:                                            + pattern, rese);
277:                        }
278:                    }
279:                } else {
280:                    excludeCrawlingURL = new HashSet();
281:                    setDefaultExcludeFromCrawling();
282:                }
283:
284:                Configuration child;
285:                String value;
286:                child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false);
287:                if (child != null) {
288:                    value = child.getValue();
289:                    if (value != null && value.length() > 0) {
290:                        this .linkContentType = value.trim();
291:                    }
292:                }
293:                child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false);
294:                if (child != null) {
295:                    value = child.getValue();
296:                    if (value != null && value.length() > 0) {
297:                        this .linkViewQuery = value.trim();
298:                    }
299:                }
300:                /*      FIXME: Also delete this if you delete the fields above.
301:                 child = configuration.getChild(USER_AGENT_CONFIG, false);
302:                 if (child != null) {
303:                 value = child.getValue();
304:                 if (value != null && value.length() > 0) {
305:                 this.userAgent = value;
306:                 }
307:                 }
308:
309:                 child = configuration.getChild(ACCEPT_CONFIG, false);
310:                 if (child != null) {
311:                 value = child.getValue();
312:                 if (value != null && value.length() > 0) {
313:                 this.accept = value;
314:                 }
315:                 }
316:                 */
317:            }
318:
319:            public void setup(SourceResolver resolver, Map objectModel,
320:                    String src, Parameters par) throws ProcessingException,
321:                    SAXException, IOException {
322:
323:                super .setup(resolver, objectModel, src, par);
324:
325:                /* Create a reusable attributes for creating nodes */
326:                this .attributes = new AttributesImpl();
327:
328:                // already done in configure...
329:                //excludeCrawlingURL = new HashSet();
330:                //this.setDefaultExcludeFromCrawling();
331:            }
332:
333:            /**
334:             * Generate XML data.
335:             *
336:             * @throws  SAXException
337:             *      if an error occurs while outputting the document
338:             * @throws  ProcessingException
339:             *      if the requsted URI wasn't found
340:             */
341:            public void generate() throws SAXException, ProcessingException {
342:                try {
343:
344:                    crawled = new HashSet();
345:                    linksToProcess = new HashSet();
346:
347:                    URL root = new URL(source);
348:                    linksToProcess.add(new Link(root, ""));
349:
350:                    if (getLogger().isDebugEnabled()) {
351:                        getLogger().debug("crawl URL " + root);
352:                    }
353:
354:                    this .contentHandler.startDocument();
355:                    this .contentHandler.startPrefixMapping(PREFIX, URI);
356:
357:                    attributes.clear();
358:                    super .contentHandler.startElement(URI, TOP_NODE_NAME,
359:                            PREFIX + ':' + TOP_NODE_NAME, attributes);
360:
361:                    while (linksToProcess.size() > 0) {
362:                        Iterator i = linksToProcess.iterator();
363:
364:                        if (i.hasNext()) {
365:                            // fetch a URL
366:                            Link link = (Link) i.next();
367:                            URL url = link.getURL();
368:
369:                            // remove it from the to-do list
370:                            linksToProcess.remove(link);
371:
372:                            String new_url_link = processURL(url, link
373:                                    .getReferrer());
374:
375:                            // calc all links from this url
376:                            if (new_url_link != null) {
377:
378:                                List url_links = getLinksFromConnection(
379:                                        new_url_link, url);
380:                                if (url_links != null) {
381:                                    // add links of this url to the to-do list
382:                                    linksToProcess.addAll(url_links);
383:                                }
384:                            }
385:                        }
386:                    }
387:
388:                    super .contentHandler.endElement(URI, TOP_NODE_NAME, PREFIX
389:                            + ':' + TOP_NODE_NAME);
390:                    this .contentHandler.endPrefixMapping(PREFIX);
391:                    this .contentHandler.endDocument();
392:                } catch (IOException ioe) {
393:                    getLogger().warn("Could not read source ", ioe);
394:                    throw new ResourceNotFoundException(
395:                            "Could not read source ", ioe);
396:                }
397:            }
398:
399:            /**
400:             * Default exclude patterns.
401:             * <p>
402:             *   By default URLs matching following patterns are excluded:
403:             * </p>
404:             * <ul>
405:             *   <li>.*\\.gif(\\?.*)?$ - exclude gif images</li>
406:             *   <li>.*\\.png(\\?.*)?$ - exclude png images</li>
407:             *   <li>.*\\.jpe?g(\\?.*)?$ - exclude jpeg images</li>
408:             *   <li>.*\\.js(\\?.*)?$ - exclude javascript </li>
409:             *   <li>.*\\.css(\\?.*)?$ - exclude cascaded stylesheets</li>
410:             * </ul>
411:             *
412:             * @since
413:             */
414:            private void setDefaultExcludeFromCrawling() {
415:                String[] EXCLUDE_FROM_CRAWLING_DEFAULT = { ".*\\.gif(\\?.*)?$",
416:                        ".*\\.png(\\?.*)?$", ".*\\.jpe?g(\\?.*)?$",
417:                        ".*\\.js(\\?.*)?$", ".*\\.css(\\?.*)?$" };
418:
419:                for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; i++) {
420:                    String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
421:                    try {
422:                        excludeCrawlingURL.add(new RE(pattern));
423:                    } catch (RESyntaxException rese) {
424:                        getLogger().error(
425:                                "Cannot create excluding regular-expression for "
426:                                        + pattern, rese);
427:                    }
428:                }
429:            }
430:
431:            /**
432:             * Retrieve a list of links of a url
433:             *
434:             * @param url_link_string url for requesting links, it is assumed that
435:             *   url_link_string queries the cocoon view links, ie of the form
436:             *   <code>http://host/foo/bar?cocoon-view=links</code>
437:             * @param url_of_referrer base url of which links are requested, ie of the form
438:             *   <code>http://host/foo/bar</code>
439:             * @return List of links from url_of_referrer, as result of requesting url
440:             *   url_link_string
441:             */
442:            protected List getLinksFromConnection(String url_link_string,
443:                    URL url_of_referrer) {
444:                List url_links = null;
445:                BufferedReader br = null;
446:                try {
447:                    URL url_link = new URL(url_link_string);
448:                    URLConnection conn = url_link.openConnection();
449:                    String content_type = conn.getContentType();
450:
451:                    if (content_type == null) {
452:                        getLogger().warn(
453:                                "No content type available for "
454:                                        + String.valueOf(url_link_string));
455:                        // caller checks if null
456:                        return url_links;
457:                    }
458:
459:                    if (getLogger().isDebugEnabled()) {
460:                        getLogger().debug("Content-type: " + content_type);
461:                    }
462:
463:                    if (content_type.equals(linkContentType)
464:                            || content_type.startsWith(linkContentType + ";")) {
465:                        url_links = new ArrayList();
466:
467:                        InputStream is = conn.getInputStream();
468:                        br = new BufferedReader(new InputStreamReader(is));
469:
470:                        // content is supposed to be a list of links,
471:                        // relative to current URL
472:                        String line;
473:                        String referrer = url_of_referrer.toString();
474:
475:                        while ((line = br.readLine()) != null) {
476:                            URL new_url = new URL(url_link, line);
477:                            boolean add_url = true;
478:                            // don't add new_url twice
479:                            if (add_url) {
480:                                add_url &= !url_links.contains(new_url);
481:                            }
482:
483:                            // don't add new_url if it has been crawled already
484:                            if (add_url) {
485:                                add_url &= !crawled
486:                                        .contains(new_url.toString());
487:                            }
488:
489:                            Link new_link = new Link(new_url, referrer);
490:                            if (add_url) {
491:                                add_url &= !linksToProcess.contains(new_link);
492:                            }
493:
494:                            // don't add if is not matched by existing include definition
495:                            if (add_url) {
496:                                add_url &= isIncludedURL(new_url.toString());
497:                            }
498:
499:                            if (add_url) {
500:                                if (getLogger().isDebugEnabled()) {
501:                                    getLogger().debug(
502:                                            "Add URL: " + new_url.toString());
503:                                }
504:                                url_links.add(new_link);
505:                            }
506:                        }
507:                        // now we have a list of URL which should be examined
508:                    }
509:                } catch (IOException ioe) {
510:                    getLogger().warn(
511:                            "Problems get links of " + url_link_string, ioe);
512:                } finally {
513:                    // explictly close the stream
514:                    if (br != null) {
515:                        try {
516:                            br.close();
517:                            br = null;
518:                        } catch (IOException ignored) {
519:                        }
520:                    }
521:                }
522:                return url_links;
523:            }
524:
525:            /**
526:             * Generate xml attributes of a url, calculate url for retrieving links
527:             *
528:             * @param url to process
529:             * @param referrer of the url
530:             * @return String url for retrieving links, or null if url is an excluded-url,
531:             *   and not an included-url.
532:             */
533:            protected String processURL(URL url, String referrer)
534:                    throws SAXException {
535:
536:                if (getLogger().isDebugEnabled()) {
537:                    getLogger().debug("getLinks URL " + url);
538:                }
539:
540:                String result = null;
541:
542:                // don't try to investigate a url which has been crawled already
543:                if (crawled.contains(url.toString())) {
544:                    return null;
545:                }
546:
547:                // mark it as crawled
548:                crawled.add(url.toString());
549:
550:                attributes.clear();
551:                attributes.addAttribute("", HREF_ATTR_NAME, HREF_ATTR_NAME,
552:                        "CDATA", url.toString());
553:                attributes.addAttribute("", REFERRER_ATTR_NAME,
554:                        REFERRER_ATTR_NAME, "CDATA", referrer);
555:
556:                // Output url, referrer, content-type, status, message for traversable url's
557:                HttpURLConnection h = null;
558:                try {
559:
560:                    URLConnection links_url_connection = url.openConnection();
561:                    h = (HttpURLConnection) links_url_connection;
562:                    String content_type = links_url_connection.getContentType();
563:
564:                    attributes.addAttribute("", CONTENT_ATTR_NAME,
565:                            CONTENT_ATTR_NAME, "CDATA", content_type);
566:
567:                    attributes.addAttribute("", MESSAGE_ATTR_NAME,
568:                            MESSAGE_ATTR_NAME, "CDATA", h.getResponseMessage());
569:
570:                    attributes.addAttribute("", STATUS_ATTR_NAME,
571:                            STATUS_ATTR_NAME, "CDATA", String.valueOf(h
572:                                    .getResponseCode()));
573:                } catch (IOException ioe) {
574:                    attributes.addAttribute("", MESSAGE_ATTR_NAME,
575:                            MESSAGE_ATTR_NAME, "CDATA", ioe.getMessage());
576:                } finally {
577:                    if (h != null) {
578:                        h.disconnect();
579:                    }
580:                }
581:
582:                // don't try to get links of a url which is excluded from crawling
583:                // try to get links of a url which is included for crawling
584:                if (!isExcludedURL(url.toString())
585:                        && isIncludedURL(url.toString())) {
586:                    // add prefix and query to get data from the linkserializer.
587:                    result = url.toExternalForm()
588:                            + ((url.toExternalForm().indexOf("?") == -1) ? "?"
589:                                    : "&") + linkViewQuery;
590:                }
591:
592:                super .contentHandler.startElement(URI, LINK_NODE_NAME, PREFIX
593:                        + ':' + LINK_NODE_NAME, attributes);
594:                super .contentHandler.endElement(URI, LINK_NODE_NAME, PREFIX
595:                        + ':' + LINK_NODE_NAME);
596:
597:                return result;
598:            }
599:
600:            /**
601:             * check if URL is a candidate for indexing
602:             *
603:             * @param  url  Description of Parameter
604:             * @return      The excludedURL value
605:             * @since
606:             */
607:            private boolean isExcludedURL(String url) {
608:                // by default include URL for crawling
609:                if (excludeCrawlingURL == null) {
610:                    if (getLogger().isDebugEnabled()) {
611:                        getLogger().debug("exclude no URL " + url);
612:                    }
613:                    return false;
614:                }
615:
616:                final String s = url;
617:                Iterator i = excludeCrawlingURL.iterator();
618:                while (i.hasNext()) {
619:                    RE pattern = (RE) i.next();
620:                    if (pattern.match(s)) {
621:                        if (getLogger().isDebugEnabled()) {
622:                            getLogger().debug("exclude URL " + url);
623:                        }
624:                        return true;
625:                    }
626:                }
627:                if (getLogger().isDebugEnabled()) {
628:                    getLogger().debug("exclude not URL " + url);
629:                }
630:                return false;
631:            }
632:
633:            /**
634:             * check if URL is a candidate for indexing
635:             *
636:             * @param  url  Description of Parameter
637:             * @return      The includedURL value
638:             * @since
639:             */
640:            private boolean isIncludedURL(String url) {
641:                // by default include URL for crawling
642:                if (includeCrawlingURL == null) {
643:                    if (getLogger().isDebugEnabled()) {
644:                        getLogger().debug("include all URL " + url);
645:                    }
646:                    return true;
647:                }
648:
649:                final String s = url;
650:                Iterator i = includeCrawlingURL.iterator();
651:                while (i.hasNext()) {
652:                    RE pattern = (RE) i.next();
653:                    if (pattern.match(s)) {
654:                        if (getLogger().isDebugEnabled()) {
655:                            getLogger().debug("include URL " + url);
656:                        }
657:                        return true;
658:                    }
659:                }
660:                if (getLogger().isDebugEnabled()) {
661:                    getLogger().debug("include not URL " + url);
662:                }
663:                return false;
664:            }
665:
666:            public void recycle() {
667:                super.recycle();
668:
669:                this.attributes = null;
670:            }
671:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.