Source Code Cross Referenced for Heritrix.java in  » Web-Crawler » heritrix » org » archive » crawler » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1. 6.0 JDK Core
2. 6.0 JDK Modules
3. 6.0 JDK Modules com.sun
4. 6.0 JDK Modules com.sun.java
5. 6.0 JDK Modules sun
6. 6.0 JDK Platform
7. Ajax
8. Apache Harmony Java SE
9. Aspect oriented
10. Authentication Authorization
11. Blogger System
12. Build
13. Byte Code
14. Cache
15. Chart
16. Chat
17. Code Analyzer
18. Collaboration
19. Content Management System
20. Database Client
21. Database DBMS
22. Database JDBC Connection Pool
23. Database ORM
24. Development
25. EJB Server geronimo
26. EJB Server GlassFish
27. EJB Server JBoss 4.2.1
28. EJB Server resin 3.1.5
29. ERP CRM Financial
30. ESB
31. Forum
32. GIS
33. Graphic Library
34. Groupware
35. HTML Parser
36. IDE
37. IDE Eclipse
38. IDE Netbeans
39. Installer
40. Internationalization Localization
41. Inversion of Control
42. Issue Tracking
43. J2EE
44. JBoss
45. JMS
46. JMX
47. Library
48. Mail Clients
49. Net
50. Parser
51. PDF
52. Portal
53. Profiler
54. Project Management
55. Report
56. RSS RDF
57. Rule Engine
58. Science
59. Scripting
60. Search Engine
61. Security
62. Servlet Container
63. Source Control
64. Swing Library
65. Template Engine
66. Test Coverage
67. Testing
68. UML
69. Web Crawler
70. Web Framework
71. Web Mail
72. Web Server
73. Web Services
74. Web Services apache cxf 2.0.1
75. Web Services AXIS2
76. Wiki Engine
77. Workflow Engines
78. XML
79. XML UI
Java
Java Tutorial
Java Open Source
Jar File Download
Java Articles
Java Products
Java by API
Photoshop Tutorials
Maya Tutorials
Flash Tutorials
3ds-Max Tutorials
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
ASP.Net
ASP.NET Tutorial
JavaScript DHTML
JavaScript Tutorial
JavaScript Reference
HTML / CSS
HTML CSS Reference
C / ANSI-C
C Tutorial
C++
C++ Tutorial
Ruby
PHP
Python
Python Tutorial
Python Open Source
SQL Server / T-SQL
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
PostgreSQL
SQL / MySQL
MySQL Tutorial
VB.Net
VB.Net Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Web Crawler » heritrix » org.archive.crawler 
Source Cross Referenced  Class Diagram Java Document (Java Doc) 


0001:        /* Heritrix
0002:         *
0003:         * $Id: Heritrix.java 4858 2007-01-15 23:37:08Z stack-sf $
0004:         *
0005:         * Created on May 15, 2003
0006:         *
0007:         * Copyright (C) 2003 Internet Archive.
0008:         *
0009:         * This file is part of the Heritrix web crawler (crawler.archive.org).
0010:         *
0011:         * Heritrix is free software; you can redistribute it and/or modify
0012:         * it under the terms of the GNU Lesser Public License as published by
0013:         * the Free Software Foundation; either version 2.1 of the License, or
0014:         * any later version.
0015:         *
0016:         * Heritrix is distributed in the hope that it will be useful,
0017:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
0018:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
0019:         * GNU Lesser Public License for more details.
0020:         *
0021:         * You should have received a copy of the GNU Lesser Public License
0022:         * along with Heritrix; if not, write to the Free Software
0023:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
0024:         */
0025:        package org.archive.crawler;
0026:
0027:        import java.io.File;
0028:        import java.io.FileInputStream;
0029:        import java.io.FileNotFoundException;
0030:        import java.io.FileOutputStream;
0031:        import java.io.IOException;
0032:        import java.io.InputStream;
0033:        import java.io.PrintStream;
0034:        import java.io.PrintWriter;
0035:        import java.net.HttpURLConnection;
0036:        import java.net.InetAddress;
0037:        import java.net.URL;
0038:        import java.net.URLConnection;
0039:        import java.net.UnknownHostException;
0040:        import java.util.ArrayList;
0041:        import java.util.Arrays;
0042:        import java.util.Collection;
0043:        import java.util.Collections;
0044:        import java.util.Enumeration;
0045:        import java.util.Hashtable;
0046:        import java.util.Iterator;
0047:        import java.util.List;
0048:        import java.util.Map;
0049:        import java.util.Properties;
0050:        import java.util.StringTokenizer;
0051:        import java.util.TimeZone;
0052:        import java.util.Vector;
0053:        import java.util.logging.Level;
0054:        import java.util.logging.LogManager;
0055:        import java.util.logging.Logger;
0056:
0057:        import javax.management.Attribute;
0058:        import javax.management.AttributeList;
0059:        import javax.management.AttributeNotFoundException;
0060:        import javax.management.DynamicMBean;
0061:        import javax.management.InstanceAlreadyExistsException;
0062:        import javax.management.InstanceNotFoundException;
0063:        import javax.management.InvalidAttributeValueException;
0064:        import javax.management.MBeanInfo;
0065:        import javax.management.MBeanNotificationInfo;
0066:        import javax.management.MBeanOperationInfo;
0067:        import javax.management.MBeanRegistration;
0068:        import javax.management.MBeanRegistrationException;
0069:        import javax.management.MBeanServer;
0070:        import javax.management.MBeanServerFactory;
0071:        import javax.management.MalformedObjectNameException;
0072:        import javax.management.NotCompliantMBeanException;
0073:        import javax.management.ObjectName;
0074:        import javax.management.ReflectionException;
0075:        import javax.management.RuntimeOperationsException;
0076:        import javax.management.openmbean.CompositeData;
0077:        import javax.management.openmbean.CompositeDataSupport;
0078:        import javax.management.openmbean.CompositeType;
0079:        import javax.management.openmbean.OpenDataException;
0080:        import javax.management.openmbean.OpenMBeanAttributeInfoSupport;
0081:        import javax.management.openmbean.OpenMBeanConstructorInfoSupport;
0082:        import javax.management.openmbean.OpenMBeanInfoSupport;
0083:        import javax.management.openmbean.OpenMBeanOperationInfoSupport;
0084:        import javax.management.openmbean.OpenMBeanParameterInfo;
0085:        import javax.management.openmbean.OpenMBeanParameterInfoSupport;
0086:        import javax.management.openmbean.OpenType;
0087:        import javax.management.openmbean.SimpleType;
0088:        import javax.management.openmbean.TabularData;
0089:        import javax.management.openmbean.TabularDataSupport;
0090:        import javax.management.openmbean.TabularType;
0091:        import javax.naming.CompoundName;
0092:        import javax.naming.Context;
0093:        import javax.naming.NameNotFoundException;
0094:        import javax.naming.NamingException;
0095:        import javax.naming.NoInitialContextException;
0096:
0097:        import org.apache.commons.cli.Option;
0098:        import org.archive.crawler.admin.CrawlJob;
0099:        import org.archive.crawler.admin.CrawlJobErrorHandler;
0100:        import org.archive.crawler.admin.CrawlJobHandler;
0101:        import org.archive.crawler.datamodel.CredentialStore;
0102:        import org.archive.crawler.datamodel.credential.Credential;
0103:        import org.archive.crawler.event.CrawlStatusListener;
0104:        import org.archive.crawler.framework.AlertManager;
0105:        import org.archive.crawler.framework.CrawlController;
0106:        import org.archive.crawler.framework.exceptions.FatalConfigurationException;
0107:        import org.archive.crawler.framework.exceptions.InitializationException;
0108:        import org.archive.crawler.selftest.SelfTestCrawlJobHandler;
0109:        import org.archive.crawler.settings.XMLSettingsHandler;
0110:        import org.archive.io.SinkHandler;
0111:        import org.archive.io.SinkHandlerLogRecord;
0112:        import org.archive.net.UURI;
0113:        import org.archive.util.FileUtils;
0114:        import org.archive.util.IoUtils;
0115:        import org.archive.util.JmxUtils;
0116:        import org.archive.util.JndiUtils;
0117:        import org.archive.util.PropertyUtils;
0118:        import org.archive.util.TextUtils;
0119:
0120:        import sun.net.www.protocol.file.FileURLConnection;
0121:
0122:        /**
0123:         * Main class for Heritrix crawler.
0124:         *
0125:         * Heritrix is usually launched by a shell script that backgrounds heritrix
0126:         * and redirects all stdout and stderr emitted by heritrix to a log file.  So
0127:         * that startup messages emitted subsequent to the redirection of stdout and
0128:         * stderr show on the console, this class prints usage or startup output
0129:         * such as where the web UI can be found, etc., to a STARTLOG that the shell
0130:         * script is waiting on.  As soon as the shell script sees output in this file,
0131:         * it prints its content and breaks out of its wait.
0132:         * See ${HERITRIX_HOME}/bin/heritrix.
0133:         * 
0134:         * <p>Heritrix can also be embedded or launched by webapp initialization or
0135:         * by JMX bootstrapping.  So far I count 4 methods of instantiation:
0136:         * <ol>
0137:         * <li>From this class's main -- the method usually used;</li>
0138:         * <li>From the Heritrix UI (The local-instances.jsp) page;</li>
0139:         * <li>A creation by a JMX agent at the behest of a remote JMX client; and</li>
0140:         * <li>A container such as tomcat or jboss.</li>
0141:         * </ol>
0142:         *
0143:         * @author gojomo
0144:         * @author Kristinn Sigurdsson
0145:         * @author Stack
0146:         */
0147:        public class Heritrix implements  DynamicMBean, MBeanRegistration {
0148:            /**
0149:             * Heritrix logging instance, named after this class.
0150:             */
0151:            private static final Logger logger = Logger
0152:                    .getLogger(Heritrix.class.getName());
0153:
                 /**
                  * Temporary directory: the location named by the
                  * <code>java.io.tmpdir</code> system property, or <code>/tmp</code>
                  * if that property is not set.
                  */
0154:            private static final File TMPDIR = new File(System.getProperty(
0155:                    "java.io.tmpdir", "/tmp"));
0156:
0157:            /**
0158:             * Name of the heritrix properties file.
0159:             */
0160:            private static final String PROPERTIES = "heritrix.properties";
0161:
0162:            /**
0163:             * Name of the key to use specifying alternate heritrix properties on
0164:             * command line.
0165:             */
0166:            private static final String PROPERTIES_KEY = PROPERTIES;
0167:
0168:            /**
0169:             * Prefix used on properties we'll add to the System.properties list.
0170:             */
0171:            private static final String HERITRIX_PROPERTIES_PREFIX = "heritrix.";
0172:
0173:            /**
0174:             * Instance of web server if one was started.
0175:             */
0176:            private static SimpleHttpServer httpServer = null;
0177:
0178:            /**
0179:             * CrawlJob handler. Manages multiple crawl jobs at runtime.
0180:             */
0181:            private CrawlJobHandler jobHandler = null;
0182:
0183:            /**
0184:             * Heritrix start log file.
0185:             *
0186:             * This file contains standard out produced by this main class for startup
0187:             * only.  Used by heritrix shell script.  Name here MUST match that in the
0188:             * <code>bin/heritrix</code> shell script.  This is a DEPENDENCY the shell
0189:             * wrapper has on this here java heritrix.
0190:             */
0191:            private static final String STARTLOG = "heritrix_dmesg.log";
0192:
0193:            /**
0194:             * Default encoding.
0195:             * 
0196:             * Used for content when fetching if none specified.
0197:             */
0198:            public static final String DEFAULT_ENCODING = "ISO-8859-1";
0199:
0200:            /**
0201:             * Heritrix stderr/stdout log file.
0202:             *
0203:             * This file should have nothing in it except messages over which we have
0204:             * no control (JVM stacktrace, 3rd-party lib emissions).  The wrapper
0205:             * startup script directs stderr/stdout here. This is an INTERDEPENDENCY
0206:             * this program has with the wrapper shell script.  Shell can actually
0207:             * pass us an alternate to use for this file.
0208:             */
0209:            private static String DEFAULT_HERITRIX_OUT = "heritrix_out.log";
0210:
0211:            /**
0212:             * Where to write this class's startup output.
0213:             * 
0214:             * This out should only be used if Heritrix is being run from the
0215:             * command-line.
0216:             */
0217:            private static PrintWriter out = null;
0218:
0219:            /**
0220:             * The org.archive package
0221:             */
0222:            private static final String ARCHIVE_PACKAGE = "org.archive.";
0223:
0224:            /**
0225:             * The crawler package.
0226:             */
0227:            private static final String CRAWLER_PACKAGE = Heritrix.class
0228:                    .getName().substring(0,
0229:                            Heritrix.class.getName().lastIndexOf('.'));
0230:
0231:            /**
0232:             * The root context for a webapp.
0233:             */
0234:            private static final String ROOT_CONTEXT = "/";
0235:
0236:            /**
0237:             * Set to true if application is started from command line.
0238:             */
0239:            private static boolean commandLine = false;
0240:
0241:            /**
0242:             * True if container initialization has been run.
0243:             */
0244:            private static boolean containerInitialized = false;
0245:
0246:            /**
0247:             * True if properties have been loaded.
0248:             */
0249:            private static boolean propertiesLoaded = false;
0250:
0251:            private static final String JAR_SUFFIX = ".jar"; // Suffix of jar file names.
0252:
                 /**
                  * Alert manager of this instance.  Assigned in the constructor to an
                  * adapter whose methods all delegate to the shared SinkHandler.
                  */
0253:            private AlertManager alertManager;
0254:
0255:            /**
0256:             * The context of the GUI webapp.  Default is root.
0257:             */
0258:            private static String adminContext = ROOT_CONTEXT;
0259:
0260:            /**
0261:             * True if we're to put up a GUI.
0262:             * Cmdline processing can override.
0263:             */
0264:            private static boolean gui = !PropertyUtils
0265:                    .getBooleanProperty("heritrix.cmdline.nowui");
0266:
0267:            /**
0268:             * Port to put the GUI up on.
0269:             * Cmdline processing can override.
0270:             */
0271:            private static int guiPort = SimpleHttpServer.DEFAULT_PORT;
0272:
0273:            /**
0274:             * A collection containing only localhost.  Used as default value
0275:             * for guiHosts, and passed to SimpleHttpServer when doing selftest.
0276:             */
0277:            final private static Collection<String> LOCALHOST_ONLY = Collections
0278:                    .unmodifiableList(Arrays
0279:                            .asList(new String[] { "127.0.0.1" }));
0280:
0281:            /**
0282:             * Hosts to bind the GUI webserver to.
0283:             * By default, only contains localhost.
0284:             * Set to an empty collection to indicate that all available network
0285:             * interfaces should be used for the webserver.
0286:             */
0287:            private static Collection<String> guiHosts = LOCALHOST_ONLY;
0288:
0289:            /**
0290:             * Web UI server, realm, context name.
0291:             */
0292:            private static String ADMIN = "admin";
0293:
0294:            // OpenMBean support.
0295:            /**
0296:             * The MBean server we're registered with (May be null).
0297:             */
0298:            private MBeanServer mbeanServer = null;
0299:
0300:            /**
0301:             * MBean name we were registered as.
0302:             */
0303:            private ObjectName mbeanName = null;
0304:
0305:            /**
0306:             * Keep reference to all instances of Heritrix.
0307:             * Used by the UI to figure which of the local Heritrice it should
0308:             * be going against and to figure what to shutdown on the way out (If
0309:             * there was always a JMX Agent, we wouldn't need to keep this list.  We
0310:             * could always ask the JMX Agent for all instances. UPDATE: True we could
0311:             * always ask the JMX Agent but we might keep around this local reference
0312:             * because it will allow faster, less awkward -- think of marshalling the args
0313:             * for JMX invoke operation -- access to local Heritrix instances.  A new
0314:             * usage for this instances Map is in CrawlJob#preRegister to find the hosting
0315:             * Heritrix instance).
0316:             */
0317:            private static Map<String, Heritrix> instances = new Hashtable<String, Heritrix>();
0318:
                 /**
                  * MBean metadata for this instance.  Set from buildMBeanInfo() in the
                  * constructor and nulled again by destroy().
                  */
0319:            private OpenMBeanInfoSupport openMBeanInfo;
                 // Attribute names.  NOTE(review): presumably the attributes advertised
                 // through openMBeanInfo -- confirm against buildMBeanInfo().
0320:            private final static String STATUS_ATTR = "Status";
0321:            private final static String VERSION_ATTR = "Version";
0322:            private final static String ISRUNNING_ATTR = "IsRunning";
0323:            private final static String ISCRAWLING_ATTR = "IsCrawling";
0324:            private final static String ALERTCOUNT_ATTR = "AlertCount";
0325:            private final static String NEWALERTCOUNT_ATTR = "NewAlertCount";
0326:            private final static String CURRENTJOB_ATTR = "CurrentJob";
                 /** The seven attribute names above as a fixed-size list. */
0327:            private final static List ATTRIBUTE_LIST;
0328:            static {
0329:                ATTRIBUTE_LIST = Arrays.asList(new String[] { STATUS_ATTR,
0330:                        VERSION_ATTR, ISRUNNING_ATTR, ISCRAWLING_ATTR,
0331:                        ALERTCOUNT_ATTR, NEWALERTCOUNT_ATTR, CURRENTJOB_ATTR });
0332:            }
0333:
                 // Operation names.  NOTE(review): presumably the operations advertised
                 // through openMBeanInfo -- confirm against buildMBeanInfo().
0334:            private final static String START_OPER = "start";
0335:            private final static String STOP_OPER = "stop";
0336:            private final static String DESTROY_OPER = "destroy";
0337:            private final static String INTERRUPT_OPER = "interrupt";
0338:            private final static String START_CRAWLING_OPER = "startCrawling";
0339:            private final static String STOP_CRAWLING_OPER = "stopCrawling";
0340:            private final static String ADD_CRAWL_JOB_OPER = "addJob";
0341:            private final static String TERMINATE_CRAWL_JOB_OPER = "terminateCurrentJob";
0342:            private final static String DELETE_CRAWL_JOB_OPER = "deleteJob";
0343:            private final static String ALERT_OPER = "alert";
0344:            private final static String ADD_CRAWL_JOB_BASEDON_OPER = "addJobBasedon";
0345:            private final static String PENDING_JOBS_OPER = "pendingJobs";
0346:            private final static String COMPLETED_JOBS_OPER = "completedJobs";
0347:            private final static String CRAWLEND_REPORT_OPER = "crawlendReport";
0348:            private final static String SHUTDOWN_OPER = "shutdown";
0349:            private final static String LOG_OPER = "log";
0350:            private final static String REBIND_JNDI_OPER = "rebindJNDI";
                 /**
                  * All seventeen operation names above as a fixed-size list.  The list
                  * order differs from the declaration order.
                  */
0351:            private final static List OPERATION_LIST;
0352:            static {
0353:                OPERATION_LIST = Arrays.asList(new String[] { START_OPER,
0354:                        STOP_OPER, INTERRUPT_OPER, START_CRAWLING_OPER,
0355:                        STOP_CRAWLING_OPER, ADD_CRAWL_JOB_OPER,
0356:                        ADD_CRAWL_JOB_BASEDON_OPER, DELETE_CRAWL_JOB_OPER,
0357:                        ALERT_OPER, PENDING_JOBS_OPER, COMPLETED_JOBS_OPER,
0358:                        CRAWLEND_REPORT_OPER, SHUTDOWN_OPER, LOG_OPER,
0359:                        DESTROY_OPER, TERMINATE_CRAWL_JOB_OPER,
0360:                        REBIND_JNDI_OPER });
0361:            }
                 // Open types used for job data.  Both start out null; where they are
                 // assigned is outside this excerpt.
0362:            private CompositeType jobCompositeType = null;
0363:            private TabularType jobsTabularType = null;
                 /** Key names of a job entry: uid, name and status. */
0364:            private static final String[] JOB_KEYS = new String[] { "uid",
0365:                    "name", "status" };
0366:
                 // NOTE(review): presumably the login for the admin web UI; both fields
                 // are assigned outside this excerpt -- confirm.
0367:            private static String adminUsername;
0368:
0369:            private static String adminPassword;
0370:
0371:            /**
0372:             * Constructor for the default (unnamed) Heritrix instance.
0373:             * Does not register the created instance with JMX.  It is assumed this
0374:             * constructor is used by, for example, a JMX agent creating an instance of
0375:             * Heritrix at the command of a remote client (in which case Heritrix will
0376:             * be registered by the invoking agent).
0377:             * @throws IOException Propagated from the constructor this one chains to.
0378:             */
0379:            public Heritrix() throws IOException {
0380:                this (null, false);
0381:            }
0382:
                 /**
                  * Constructor for the default (unnamed) Heritrix instance, letting the
                  * caller choose whether it gets registered with JMX.
                  * @param jmxregister True if we are to register this instance with JMX
                  * agent.
                  * @throws IOException Propagated from the constructor this one chains to.
                  */
0383:            public Heritrix(final boolean jmxregister) throws IOException {
0384:                this (null, jmxregister);
0385:            }
0386:
0387:            /**
0388:             * Constructor that supplies its own job handler: a new CrawlJobHandler
                  * created over the directory returned by getJobsdir().
0389:             * @param name If null, we bring up the default Heritrix instance.
0390:             * @param jmxregister True if we are to register this instance with JMX
0391:             * agent.
0392:             * @throws IOException Propagated from the constructor this one chains to.
0393:             */
0394:            public Heritrix(final String name, final boolean jmxregister)
0395:                    throws IOException {
0396:                this (name, jmxregister, new CrawlJobHandler(getJobsdir()));
0397:            }
0398:
0399:            /**
0400:             * Constructor the shorter constructors chain to.
                  * In order, it runs containerInitialization(), stores the passed job
                  * handler, builds the MBean info, wraps the shared SinkHandler in an
                  * AlertManager adapter and hands this instance to registerHeritrix.
0401:             * @param name If null, we bring up the default Heritrix instance.
0402:             * @param jmxregister True if we are to register this instance with JMX
0403:             * agent.
0404:             * @param cjh CrawlJobHandler to use.
0405:             * @throws IOException For instance from containerInitialization().
                  * @throws NullPointerException If SinkHandler.getInstance() returns null.
                  * @throws RuntimeException Wrapping any of the JMX registration
                  * exceptions caught around registerHeritrix.
0406:             */
0407:            public Heritrix(final String name, final boolean jmxregister,
0408:                    final CrawlJobHandler cjh) throws IOException {
0409:                super ();
0410:                containerInitialization();
0411:                this .jobHandler = cjh;
0412:                this .openMBeanInfo = buildMBeanInfo();
0413:                // Set up the alerting system.  SinkHandler is also a global so will
0414:                // catch alerts for all running Heritrix instances.  Will need to
0415:                // address (Add name of instance that threw the alert to SinkRecord?).
0416:                final SinkHandler sinkHandler = SinkHandler.getInstance();
0417:                if (sinkHandler == null) {
0418:                    throw new NullPointerException("SinkHandler not found.");
0419:                }
0420:                // Adapt the alerting system to use SinkHandler.
                     // Alert IDs arrive as Strings and are parsed to long before being
                     // passed on, so a non-numeric ID raises NumberFormatException.
0421:                this .alertManager = new AlertManager() {
0422:                    public void add(SinkHandlerLogRecord record) {
0423:                        sinkHandler.publish(record);
0424:                    }
0425:
0426:                    public Vector getAll() {
0427:                        return sinkHandler.getAll();
0428:                    }
0429:
0430:                    public Vector getNewAll() {
0431:                        return sinkHandler.getAllUnread();
0432:                    }
0433:
0434:                    public SinkHandlerLogRecord get(String alertID) {
0435:                        return sinkHandler.get(Long.parseLong(alertID));
0436:                    }
0437:
0438:                    public int getCount() {
0439:                        return sinkHandler.getCount();
0440:                    }
0441:
0442:                    public int getNewCount() {
0443:                        return sinkHandler.getUnreadCount();
0444:                    }
0445:
0446:                    public void remove(String alertID) {
0447:                        sinkHandler.remove(Long.parseLong(alertID));
0448:                    }
0449:
0450:                    public void read(String alertID) {
0451:                        sinkHandler.read(Long.parseLong(alertID));
0452:                    }
0453:                };
0454:
                     // Registration failures are rethrown unchecked with the original
                     // exception kept as the cause.
0455:                try {
0456:                    Heritrix.registerHeritrix(this , name, jmxregister);
0457:                } catch (InstanceAlreadyExistsException e) {
0458:                    throw new RuntimeException(e);
0459:                } catch (MBeanRegistrationException e) {
0460:                    throw new RuntimeException(e);
0461:                } catch (NotCompliantMBeanException e) {
0462:                    throw new RuntimeException(e);
0463:                } catch (MalformedObjectNameException e) {
0464:                    throw new RuntimeException(e);
0465:                }
0466:            }
0467:
0468:            /**
0469:             * Run setup tasks for this 'container'. Idempotent: calls after the first
                  * return immediately.
0470:             *
                  * <p>The containerInitialized flag is set before any of the setup work
                  * runs, so a call that fails part-way is not retried by later calls.
                  * The check-and-set of the flag is not synchronized.
                  *
0471:             * @throws IOException NOTE(review): presumably from one of the setup
                  * helpers called here; none of them is visible in this excerpt.
0472:             */
0473:            protected static void containerInitialization() throws IOException {
0474:                if (Heritrix.containerInitialized) {
0475:                    return;
0476:                }
0477:                Heritrix.containerInitialized = true;
0478:                // Load up the properties.  This invocation adds heritrix properties
0479:                // to system properties so all available via System.getProperty.
0480:                // Note, loadProperties and patchLogging have global effects.  May be an
0481:                // issue if we're running inside a container such as tomcat or jboss.
0482:                Heritrix.loadProperties();
0483:                Heritrix.patchLogging();
0484:                Heritrix.configureTrustStore();
0485:                // Will run on SIGTERM but not on SIGKILL, unfortunately.
0486:                // Otherwise, ensures we cleanup after ourselves (Deregister from
0487:                // JMX and JNDI).
0488:                Runtime.getRuntime().addShutdownHook(
0489:                        Heritrix.getShutdownThread(false, 0,
0490:                                "Heritrix shutdown hook"));
0491:                // Register this heritrix 'container' though we may be inside another
0492:                // tomcat or jboss container.
                     // A failed JNDI registration is logged at WARNING and otherwise ignored.
0493:                try {
0494:                    registerContainerJndi();
0495:                } catch (Exception e) {
0496:                    logger.log(Level.WARNING,
0497:                            "Failed jndi container registration.", e);
0498:                }
0499:            }
0500:
0501:            /**
0502:             * Do inverse of construction. Used by anyone who does a 'new Heritrix' when
0503:             * they want to cleanup the instance.
0504:             * Of note, there may be Heritrix threads still hanging around after the
0505:             * call to destroy completes.  They'll eventually go down after they've
0506:             * finished their cleanup routines.  In particular, if you are watching
0507:             * Heritrix via JMX, you can see the Heritrix instance JMX bean unregister
0508:             * ahead of the CrawlJob JMX bean that it is hosting.
                  *
                  * <p>Calls stop(), unregisters this instance and then drops the
                  * references to the job handler and the MBean info.  A failure to
                  * unregister is logged at WARNING through the class logger and does not
                  * stop the rest of the cleanup.
0509:             */
0510:            public void destroy() {
0511:                stop();
0512:                try {
0513:                    Heritrix.unregisterHeritrix(this );
0514:                } catch (InstanceNotFoundException e) {
0515:                    logger.log(Level.WARNING, "Failed unregistration of Heritrix instance.", e);
0516:                } catch (MBeanRegistrationException e) {
0517:                    logger.log(Level.WARNING, "Failed unregistration of Heritrix instance.", e);
0518:                } catch (NullPointerException e) {
0519:                    logger.log(Level.WARNING, "Failed unregistration of Heritrix instance.", e);
0520:                }
0521:                this .jobHandler = null;
0522:                this .openMBeanInfo = null;
0523:            }
0524:
0525:            /**
0526:             * Launch program.
0527:             * Optionally will launch a web server to host UI.  Will also register
0528:             * Heritrix MBean with first found JMX Agent (Usually the 1.5.0 JVM
0529:             * Agent).
0530:             *
                  * <p>Marks the run as command-line, sets the JVM default time zone to
                  * GMT and opens the startup writer: System.out when isDevelopment() is
                  * true, otherwise the STARTLOG file under getHeritrixHome().  It then
                  * runs containerInitialization() and doCmdLineArgs(args), printing any
                  * non-null status the latter returns to the startup writer.
                  *
0531:             * @param args Command line arguments.
0532:             * @throws Exception Anything thrown during initialization or argument
                  * processing; its stack trace is written to the startup writer before
                  * it is rethrown.
0533:             */
0534:            public static void main(String[] args) throws Exception {
0535:                Heritrix.commandLine = true;
0536:
0537:                // Set timezone here.  Would be problematic doing it if we're running
0538:                // inside a container.
0539:                TimeZone.setDefault(TimeZone.getTimeZone("GMT"));
0540:
0541:                File startLog = new File(getHeritrixHome(), STARTLOG);
0542:                Heritrix.out = new PrintWriter(isDevelopment() ? System.out
0543:                        : new PrintStream(new FileOutputStream(startLog)));
0544:
0545:                try {
0546:                    containerInitialization();
0547:                    String status = doCmdLineArgs(args);
0548:                    if (status != null) {
0549:                        Heritrix.out.println(status);
0550:                    }
0551:                }
0552:
0553:                catch (Exception e) {
0554:                    // Show any exceptions in STARTLOG.
0555:                    e.printStackTrace(Heritrix.out);
0556:                    throw e;
0557:                }
0558:
0559:                finally {
0560:                    // If not development, close the file that signals the wrapper
0561:                    // script that we've started.  Otherwise, just flush it; if in
0562:                    // development, the output is probably a console.
0563:                    if (!isDevelopment()) {
0564:                        if (Heritrix.out != null) {
0565:                            Heritrix.out.close();
0566:                        }
                             // The version line goes to stdout, not to the start log.
0567:                        System.out.println("Heritrix version: "
0568:                                + Heritrix.getVersion());
0569:                    } else {
0570:                        if (Heritrix.out != null) {
0571:                            Heritrix.out.flush();
0572:                        }
0573:                    }
0574:                }
0575:            }
0576:
0577:            protected static String doCmdLineArgs(final String[] args)
0578:                    throws Exception {
0579:                // Get defaults for commandline arguments from the properties file.
0580:                String tmpStr = PropertyUtils
0581:                        .getPropertyOrNull("heritrix.context");
0582:                if (tmpStr != null) {
0583:                    Heritrix.adminContext = tmpStr;
0584:                }
0585:                tmpStr = PropertyUtils
0586:                        .getPropertyOrNull("heritrix.cmdline.port");
0587:                if (tmpStr != null) {
0588:                    Heritrix.guiPort = Integer.parseInt(tmpStr);
0589:                }
0590:                tmpStr = PropertyUtils
0591:                        .getPropertyOrNull("heritrix.cmdline.admin");
0592:                String adminLoginPassword = (tmpStr == null) ? "" : tmpStr;
0593:                String crawlOrderFile = PropertyUtils
0594:                        .getPropertyOrNull("heritrix.cmdline.order");
0595:                tmpStr = PropertyUtils
0596:                        .getPropertyOrNull("heritrix.cmdline.run");
0597:                boolean runMode = PropertyUtils
0598:                        .getBooleanProperty("heritrix.cmdline.run");
0599:                boolean selfTest = false;
0600:                String selfTestName = null;
0601:                CommandLineParser clp = new CommandLineParser(args,
0602:                        Heritrix.out, Heritrix.getVersion());
0603:                List arguments = clp.getCommandLineArguments();
0604:                Option[] options = clp.getCommandLineOptions();
0605:
0606:                // Check passed argument.  Only one argument, the ORDER_FILE is allowed.
0607:                // If one argument, make sure exists and xml suffix.
0608:                if (arguments.size() > 1) {
0609:                    clp.usage(1);
0610:                } else if (arguments.size() == 1) {
0611:                    crawlOrderFile = (String) arguments.get(0);
0612:                    if (!(new File(crawlOrderFile).exists())) {
0613:                        clp.usage("ORDER.XML <" + crawlOrderFile
0614:                                + "> specified does not exist.", 1);
0615:                    }
0616:                    // Must end with '.xml'
0617:                    if (crawlOrderFile.length() > 4
0618:                            && !crawlOrderFile.substring(
0619:                                    crawlOrderFile.length() - 4)
0620:                                    .equalsIgnoreCase(".xml")) {
0621:                        clp.usage("ORDER.XML <" + crawlOrderFile
0622:                                + "> does not have required '.xml' suffix.", 1);
0623:                    }
0624:                }
0625:
0626:                // Now look at options passed.
0627:                for (int i = 0; i < options.length; i++) {
0628:                    switch (options[i].getId()) {
0629:                    case 'h':
0630:                        clp.usage();
0631:                        break;
0632:
0633:                    case 'a':
0634:                        adminLoginPassword = options[i].getValue();
0635:                        break;
0636:
0637:                    case 'n':
0638:                        if (crawlOrderFile == null) {
0639:                            clp.usage("You must specify an ORDER_FILE with"
0640:                                    + " '--nowui' option.", 1);
0641:                        }
0642:                        Heritrix.gui = false;
0643:                        break;
0644:
0645:                    case 'b':
0646:                        Heritrix.guiHosts = parseHosts(options[i].getValue());
0647:                        break;
0648:
0649:                    case 'p':
0650:                        try {
0651:                            Heritrix.guiPort = Integer.parseInt(options[i]
0652:                                    .getValue());
0653:                        } catch (NumberFormatException e) {
0654:                            clp.usage("Failed parse of port number: "
0655:                                    + options[i].getValue(), 1);
0656:                        }
0657:                        if (Heritrix.guiPort <= 0) {
0658:                            clp.usage("Nonsensical port number: "
0659:                                    + options[i].getValue(), 1);
0660:                        }
0661:                        break;
0662:
0663:                    case 'r':
0664:                        runMode = true;
0665:                        break;
0666:
0667:                    case 's':
0668:                        selfTestName = options[i].getValue();
0669:                        selfTest = true;
0670:                        break;
0671:
0672:                    default:
0673:                        assert false : options[i].getId();
0674:                    }
0675:                }
0676:
0677:                // Ok, we should now have everything to launch the program.
0678:                String status = null;
0679:                if (selfTest) {
0680:                    // If more than just '--selftest' and '--port' passed, then
0681:                    // there is confusion on what is being asked of us.  Print usage
0682:                    // rather than proceed.
0683:                    for (int i = 0; i < options.length; i++) {
0684:                        if (options[i].getId() != 'p'
0685:                                && options[i].getId() != 's') {
0686:                            clp.usage(1);
0687:                        }
0688:                    }
0689:
0690:                    if (arguments.size() > 0) {
0691:                        // No arguments accepted by selftest.
0692:                        clp.usage(1);
0693:                    }
0694:                    status = selftest(selfTestName, Heritrix.guiPort);
0695:                } else {
0696:                    if (!isValidLoginPasswordString(adminLoginPassword)) {
0697:                        clp.usage(
0698:                                "Invalid admin login:password value, or none "
0699:                                        + "specified. ", 1);
0700:                    }
0701:
0702:                    if (!Heritrix.gui) {
0703:                        if (options.length > 1) {
0704:                            // If more than just '--nowui' passed, then there is
0705:                            // confusion on what is being asked of us. Print usage
0706:                            // rather than proceed.
0707:                            clp.usage(1);
0708:                        }
0709:                        Heritrix h = new Heritrix(true);
0710:                        status = h.doOneCrawl(crawlOrderFile);
0711:                    } else {
0712:                        status = startEmbeddedWebserver(Heritrix.guiHosts,
0713:                                Heritrix.guiPort, adminLoginPassword);
0714:                        Heritrix h = new Heritrix(true);
0715:
0716:                        String tmp = h.launch(crawlOrderFile, runMode);
0717:                        if (tmp != null) {
0718:                            status += ('\n' + tmp);
0719:                        }
0720:                    }
0721:                }
0722:                return status;
0723:            }
0724:
0725:            /**
0726:             * @return The file we dump stdout and stderr into.
0727:             */
0728:            public static String getHeritrixOut() {
0729:                String tmp = System.getProperty("heritrix.out");
0730:                if (tmp == null || tmp.length() == 0) {
0731:                    tmp = Heritrix.DEFAULT_HERITRIX_OUT;
0732:                }
0733:                return tmp;
0734:            }
0735:
0736:            /**
0737:             * Exploit <code>-Dheritrix.home</code> if available to us.
0738:             * Is current working dir if no heritrix.home property supplied.
0739:             * @return Heritrix home directory.
0740:             * @throws IOException
0741:             */
0742:            protected static File getHeritrixHome() throws IOException {
0743:                File heritrixHome = null;
0744:                String home = System.getProperty("heritrix.home");
0745:                if (home != null && home.length() > 0) {
0746:                    heritrixHome = new File(home);
0747:                    if (!heritrixHome.exists()) {
0748:                        throw new IOException("HERITRIX_HOME <" + home
0749:                                + "> does not exist.");
0750:                    }
0751:                } else {
0752:                    heritrixHome = new File(new File("").getAbsolutePath());
0753:                }
0754:                return heritrixHome;
0755:            }
0756:
0757:            /**
0758:             * @return The directory into which we put jobs.  If the system property
0759:             * 'heritrix.jobsdir' is set, we will use its value in place of the default
0760:             * 'jobs' directory in the current working directory.
0761:             * @throws IOException
0762:             */
0763:            public static File getJobsdir() throws IOException {
0764:                Heritrix.loadProperties(); // if called in constructor
0765:                String jobsdirStr = System.getProperty("heritrix.jobsdir",
0766:                        "jobs");
0767:                File jobsdir = new File(jobsdirStr);
0768:                return (jobsdir.isAbsolute()) ? jobsdir : new File(
0769:                        getHeritrixHome(), jobsdirStr);
0770:            }
0771:
0772:            /**
0773:             * Get and check for existence of expected subdir.
0774:             *
0775:             * If development flag set, then look for dir under src dir.
0776:             *
0777:             * @param subdirName Dir to look for.
0778:             * @return The extant subdir.  Otherwise null if we're running
0779:             * in a webapp context where there is no conf directory available.
0780:             * @throws IOException if unable to find expected subdir.
0781:             */
0782:            protected static File getSubDir(String subdirName)
0783:                    throws IOException {
0784:                return getSubDir(subdirName, true);
0785:            }
0786:
0787:            /**
0788:             * Get and optionally check for existence of subdir.
0789:             *
0790:             * If development flag set, then look for dir under src dir.
0791:             *
0792:             * @param subdirName Dir to look for.
0793:             * @param fail True if we are to fail if directory does not
0794:             * exist; false if we are to return false if the directory does not exist.
0795:             * @return The extant subdir.  Otherwise null if we're running
0796:             * in a webapp context where there is no subdir directory available.
0797:             * @throws IOException if unable to find expected subdir.
0798:             */
0799:            protected static File getSubDir(String subdirName, boolean fail)
0800:                    throws IOException {
0801:                String path = isDevelopment() ? "src" + File.separator
0802:                        + subdirName : subdirName;
0803:                File dir = new File(getHeritrixHome(), path);
0804:                if (!dir.exists()) {
0805:                    if (fail) {
0806:                        throw new IOException("Cannot find subdir: "
0807:                                + subdirName);
0808:                    }
0809:                    dir = null;
0810:                }
0811:                return dir;
0812:            }
0813:
0814:            /**
0815:             * Test string is valid login/password string.
0816:             *
0817:             * A valid login/password string has the login and password compounded
0818:             * w/ a ':' delimiter.
0819:             *
0820:             * @param str String to test.
0821:             * @return True if valid password/login string.
0822:             */
0823:            protected static boolean isValidLoginPasswordString(String str) {
0824:                boolean isValid = false;
0825:                StringTokenizer tokenizer = new StringTokenizer(str, ":");
0826:                if (tokenizer.countTokens() == 2) {
0827:                    String login = ((String) tokenizer.nextElement()).trim();
0828:                    String password = ((String) tokenizer.nextElement()).trim();
0829:                    if (login.length() > 0 && password.length() > 0) {
0830:                        isValid = true;
0831:                    }
0832:                }
0833:                return isValid;
0834:            }
0835:
0836:            protected static boolean isDevelopment() {
0837:                return System.getProperty("heritrix.development") != null;
0838:            }
0839:
0840:            /**
0841:             * Load the heritrix.properties file.
0842:             * 
0843:             * Adds any property that starts with
0844:             * <code>HERITRIX_PROPERTIES_PREFIX</code>
0845:             * or <code>ARCHIVE_PACKAGE</code>
0846:             * into system properties (except logging '.level' directives).
0847:             * @return Loaded properties.
0848:             * @throws IOException
0849:             */
0850:            protected static Properties loadProperties() throws IOException {
0851:                if (Heritrix.propertiesLoaded) {
0852:                    return System.getProperties();
0853:                }
0854:                Heritrix.propertiesLoaded = true;
0855:
0856:                Properties properties = new Properties();
0857:                properties.load(getPropertiesInputStream());
0858:
0859:                // Any property that begins with ARCHIVE_PACKAGE, make it
0860:                // into a system property. While iterating, check to see if anything
0861:                // defined on command-line, and if so, it overrules whats in
0862:                // heritrix.properties.
0863:                for (Enumeration e = properties.keys(); e.hasMoreElements();) {
0864:                    String key = ((String) e.nextElement()).trim();
0865:                    if (key.startsWith(ARCHIVE_PACKAGE)
0866:                            || key.startsWith(HERITRIX_PROPERTIES_PREFIX)) {
0867:                        // Don't add the heritrix.properties entries that are
0868:                        // changing the logging level of particular classes.
0869:                        if (key.indexOf(".level") < 0) {
0870:                            if (System.getProperty(key) == null
0871:                                    || System.getProperty(key).length() == 0) {
0872:                                System.setProperty(key, properties.getProperty(
0873:                                        key).trim());
0874:                            }
0875:                        }
0876:                    }
0877:                }
0878:                return properties;
0879:            }
0880:
0881:            protected static InputStream getPropertiesInputStream()
0882:                    throws IOException {
0883:                File file = null;
0884:                // Look to see if properties have been passed on the cmd-line.
0885:                String alternateProperties = System.getProperty(PROPERTIES_KEY);
0886:                if (alternateProperties != null
0887:                        && alternateProperties.length() > 0) {
0888:                    file = new File(alternateProperties);
0889:                }
0890:                // Get properties from conf directory if one available.
0891:                if ((file == null || !file.exists())
0892:                        && getConfdir(false) != null) {
0893:                    file = new File(getConfdir(), PROPERTIES);
0894:                    if (!file.exists()) {
0895:                        // If no properties file in the conf dir, set file back to
0896:                        // null so we go looking for heritrix.properties on classpath.
0897:                        file = null;
0898:                    }
0899:                }
0900:                // If not on the command-line, there is no conf dir. Then get the
0901:                // properties from the CLASSPATH (Classpath file separator is always
0902:                // '/', whatever the platform.
0903:                InputStream is = (file != null) ? new FileInputStream(file)
0904:                        : Heritrix.class.getResourceAsStream("/"
0905:                                + PROPERTIES_KEY);
0906:                if (is == null) {
0907:                    throw new IOException("Failed to load properties file from"
0908:                            + " filesystem or from classpath.");
0909:                }
0910:                return is;
0911:            }
0912:
0913:            /**
0914:             * If the user hasn't altered the default logging parameters, tighten them
0915:             * up somewhat: some of our libraries are way too verbose at the INFO or
0916:             * WARNING levels.
0917:             * 
0918:             * This might be a problem running inside in someone else's
0919:             * container.  Container's seem to prefer commons logging so we
0920:             * ain't messing them doing the below.
0921:             *
0922:             * @throws IOException
0923:             * @throws SecurityException
0924:             */
0925:            protected static void patchLogging() throws SecurityException,
0926:                    IOException {
0927:                if (System.getProperty("java.util.logging.config.class") != null) {
0928:                    return;
0929:                }
0930:
0931:                if (System.getProperty("java.util.logging.config.file") != null) {
0932:                    return;
0933:                }
0934:
0935:                // No user-set logging properties established; use defaults
0936:                // from distribution-packaged 'heritrix.properties'.
0937:                LogManager.getLogManager().readConfiguration(
0938:                        getPropertiesInputStream());
0939:            }
0940:
0941:            /**
0942:             * Configure our trust store.
0943:             *
0944:             * If system property is defined, then use it for our truststore.  Otherwise
0945:             * use the heritrix truststore under conf directory if it exists.
0946:             * 
0947:             * <p>If we're not launched from the command-line, we will not be able
0948:             * to find our truststore.  The truststore is nor normally used so rare
0949:             * should this be a problem (In case where we don't use find our trust
0950:             * store, we'll use the 'default' -- either the JVMs or the containers).
0951:             */
0952:            protected static void configureTrustStore() {
0953:                // Below must be defined in jsse somewhere but can' find it.
0954:                final String TRUSTSTORE_KEY = "javax.net.ssl.trustStore";
0955:                String value = System.getProperty(TRUSTSTORE_KEY);
0956:                File confdir = null;
0957:                try {
0958:                    confdir = getConfdir(false);
0959:                } catch (IOException e) {
0960:                    logger.log(Level.WARNING, "Failed to get confdir.", e);
0961:                }
0962:                if ((value == null || value.length() <= 0) && confdir != null) {
0963:                    // Use the heritrix store if it exists on disk.
0964:                    File heritrixStore = new File(confdir, "heritrix.cacerts");
0965:                    if (heritrixStore.exists()) {
0966:                        value = heritrixStore.getAbsolutePath();
0967:                    }
0968:                }
0969:
0970:                if (value != null && value.length() > 0) {
0971:                    System.setProperty(TRUSTSTORE_KEY, value);
0972:                }
0973:            }
0974:
0975:            /**
0976:             * Run the selftest
0977:             *
0978:             * @param oneSelfTestName Name of a test if we are to run one only rather
0979:             * than the default running all tests.
0980:             * @param port Port number to use for web UI.
0981:             *
0982:             * @exception Exception
0983:             * @return Status of how selftest startup went.
0984:             */
0985:            protected static String selftest(final String oneSelfTestName,
0986:                    final int port) throws Exception {
0987:                // Put up the webserver w/ the root and selftest webapps only.
0988:                final String SELFTEST = "selftest";
0989:                Heritrix.httpServer = new SimpleHttpServer(SELFTEST,
0990:                        Heritrix.adminContext, LOCALHOST_ONLY, port, true);
0991:                // Set up digest auth for a section of the server so selftest can run
0992:                // auth tests.  Looks like can only set one login realm going by the
0993:                // web.xml dtd.  Otherwise, would be nice to selftest basic and digest.
0994:                // Have login, password and role all be SELFTEST.  Must match what is
0995:                // in the selftest order.xml file.
0996:                Heritrix.httpServer.setAuthentication(SELFTEST,
0997:                        Heritrix.adminContext, SELFTEST, SELFTEST, SELFTEST);
0998:                Heritrix.httpServer.startServer();
0999:                // Get the order file from the CLASSPATH unless we're running in dev
1000:                // environment.
1001:                File selftestDir = (isDevelopment()) ? new File(getConfdir(),
1002:                        SELFTEST) : new File(File.separator + SELFTEST);
1003:                File crawlOrderFile = new File(selftestDir, "order.xml");
1004:                // Create a job based off the selftest order file.  Then use this as
1005:                // a template to pass jobHandler.newJob().  Doing this gets our
1006:                // selftest output to show under the jobs directory.
1007:                // Pass as a seed a pointer to the webserver we just put up.
1008:                final String ROOTURI = "127.0.0.1:" + Integer.toString(port);
1009:                String selfTestUrl = "http://" + ROOTURI + '/';
1010:                if (oneSelfTestName != null && oneSelfTestName.length() > 0) {
1011:                    selfTestUrl += (oneSelfTestName + '/');
1012:                }
1013:                CrawlJobHandler cjh = new SelfTestCrawlJobHandler(getJobsdir(),
1014:                        oneSelfTestName, selfTestUrl);
1015:                Heritrix h = new Heritrix("Selftest", true, cjh);
1016:                CrawlJob job = createCrawlJob(cjh, crawlOrderFile, "Template");
1017:                job = h.getJobHandler().newJob(job, null, SELFTEST,
1018:                        "Integration self test", selfTestUrl,
1019:                        CrawlJob.PRIORITY_CRITICAL);
1020:                h.getJobHandler().addJob(job);
1021:                // Before we start, need to change some items in the settings file.
1022:                CredentialStore cs = (CredentialStore) job.getSettingsHandler()
1023:                        .getOrder().getAttribute(CredentialStore.ATTR_NAME);
1024:                for (Iterator i = cs.iterator(null); i.hasNext();) {
1025:                    ((Credential) i.next()).setCredentialDomain(null, ROOTURI);
1026:                }
1027:                h.getJobHandler().startCrawler();
1028:                StringBuffer buffer = new StringBuffer();
1029:                buffer.append("Heritrix " + Heritrix.getVersion()
1030:                        + " selftest started.");
1031:                buffer.append("\nSelftest first crawls " + selfTestUrl
1032:                        + " and then runs an analysis.");
1033:                buffer.append("\nResult of analysis printed to "
1034:                        + getHeritrixOut() + " when done.");
1035:                buffer.append("\nSelftest job directory for logs and arcs:\n"
1036:                        + job.getDirectory().getAbsolutePath());
1037:                return buffer.toString();
1038:            }
1039:
1040:            /**
1041:             * Launch the crawler without a web UI and run the passed crawl only.
1042:             * 
1043:             * Specialized version of {@link #launch()}.
1044:             *
1045:             * @param crawlOrderFile The crawl order to crawl.
1046:             * @throws InitializationException
1047:             * @throws InvalidAttributeValueException
1048:             * @return Status string.
1049:             */
1050:            protected String doOneCrawl(String crawlOrderFile)
1051:                    throws InitializationException,
1052:                    InvalidAttributeValueException {
1053:                return doOneCrawl(crawlOrderFile, null);
1054:            }
1055:
1056:            /**
1057:             * Launch the crawler without a web UI and run passed crawl only.
1058:             * 
1059:             * Specialized version of {@link #launch()}.
1060:             *
1061:             * @param crawlOrderFile The crawl order to crawl.
1062:             * @param listener Register this crawl status listener before starting
1063:             * crawl (You can use this listener to notice end-of-crawl).
1064:             * @throws InitializationException
1065:             * @throws InvalidAttributeValueException
1066:             * @return Status string.
1067:             */
1068:            protected String doOneCrawl(String crawlOrderFile,
1069:                    CrawlStatusListener listener)
1070:                    throws InitializationException,
1071:                    InvalidAttributeValueException {
1072:                XMLSettingsHandler handler = new XMLSettingsHandler(new File(
1073:                        crawlOrderFile));
1074:                handler.initialize();
1075:                CrawlController controller = new CrawlController();
1076:                controller.initialize(handler);
1077:                if (listener != null) {
1078:                    controller.addCrawlStatusListener(listener);
1079:                }
1080:                controller.requestCrawlStart();
1081:                return "Crawl started using " + crawlOrderFile + ".";
1082:            }
1083:
1084:            /**
1085:             * Launch the crawler for a web UI.
1086:             *
1087:             * Crawler hangs around waiting on jobs.
1088:             *
1089:             * @exception Exception
1090:             * @return A status string describing how the launch went.
1091:             * @throws Exception
1092:             */
1093:            public String launch() throws Exception {
1094:                return launch(null, false);
1095:            }
1096:
1097:            /**
1098:             * Launch the crawler for a web UI.
1099:             *
1100:             * Crawler hangs around waiting on jobs.
1101:             * 
1102:             * @param crawlOrderFile File to crawl.  May be null.
1103:             * @param runMode Whether crawler should be set to run mode.
1104:             *
1105:             * @exception Exception
1106:             * @return A status string describing how the launch went.
1107:             */
1108:            public String launch(String crawlOrderFile, boolean runMode)
1109:                    throws Exception {
1110:                String status = null;
1111:                if (crawlOrderFile != null) {
1112:                    addCrawlJob(crawlOrderFile, "Autolaunched", "", "");
1113:                    if (runMode) {
1114:                        this .jobHandler.startCrawler();
1115:                        status = "Job being crawled: " + crawlOrderFile;
1116:                    } else {
1117:                        status = "Crawl job ready and pending: "
1118:                                + crawlOrderFile;
1119:                    }
1120:                } else if (runMode) {
1121:                    // The use case is that jobs are to be run on a schedule and that
1122:                    // if the crawler is in run mode, then the scheduled job will be
1123:                    // run at appropriate time.  Otherwise, not.
1124:                    this .jobHandler.startCrawler();
1125:                    status = "Crawler set to run mode.";
1126:                }
1127:                return status;
1128:            }
1129:
1130:            /**
1131:             * Start up the embedded Jetty webserver instance.
1132:             * This is done when we're run from the command-line.
1133:             * @param port Port number to use for web UI.
1134:             * @param adminLoginPassword Compound of login and password.
1135:             * @throws Exception
1136:             * @return Status on webserver startup.
1137:             * @deprecated  Use startEmbeddedWebserver(hosts, port, adminLoginPassword)
1138:             */
1139:            protected static String startEmbeddedWebserver(final int port,
1140:                    final boolean lho, final String adminLoginPassword)
1141:                    throws Exception {
1142:                ArrayList<String> hosts = new ArrayList<String>();
1143:                if (lho) {
1144:                    hosts.add("127.0.0.1");
1145:                }
1146:                return startEmbeddedWebserver(hosts, port, adminLoginPassword);
1147:            }
1148:
1149:            /**
1150:             * Parses a list of host names.
1151:             * 
1152:             * <p>If the given string is <code>/</code>, then an empty
1153:             * collection is returned.  This indicates that all available network
1154:             * interfaces should be used.
1155:             * 
1156:             * <p>Otherwise, the string must contain a comma-separated list of 
1157:             * IP addresses or host names.  The parsed list is then returned.
1158:             * 
1159:             * @param hosts  the string to parse
1160:             * @return  the parsed collection of hosts 
1161:             */
1162:            private static Collection<String> parseHosts(String hosts) {
1163:                hosts = hosts.trim();
1164:                if (hosts.equals("/")) {
1165:                    return new ArrayList<String>(1);
1166:                }
1167:                String[] hostArray = hosts.split(",");
1168:                for (int i = 0; i < hostArray.length; i++) {
1169:                    hostArray[i] = hostArray[i].trim();
1170:                }
1171:                return Arrays.asList(hostArray);
1172:            }
1173:
1174:            /**
1175:             * Start up the embedded Jetty webserver instance.
1176:             * This is done when we're run from the command-line.
1177:             * 
1178:             * @param hosts  a list of IP addresses or hostnames to bind to, or an
1179:             *               empty collection to bind to all available network 
1180:             *               interfaces
1181:             * @param port Port number to use for web UI.
1182:             * @param adminLoginPassword Compound of login and password.
1183:             * @throws Exception
1184:             * @return Status on webserver startup.
1185:             */
1186:            protected static String startEmbeddedWebserver(
1187:                    Collection<String> hosts, int port,
1188:                    String adminLoginPassword) throws Exception {
1189:                adminUsername = adminLoginPassword.substring(0,
1190:                        adminLoginPassword.indexOf(":"));
1191:                adminPassword = adminLoginPassword.substring(adminLoginPassword
1192:                        .indexOf(":") + 1);
1193:                Heritrix.httpServer = new SimpleHttpServer("admin",
1194:                        Heritrix.adminContext, hosts, port, false);
1195:
1196:                final String DOTWAR = ".war";
1197:                final String SELFTEST = "selftest";
1198:
1199:                // Look for additional WAR files beyond 'selftest' and 'admin'.
1200:                File[] wars = getWarsdir().listFiles();
1201:                for (int i = 0; i < wars.length; i++) {
1202:                    if (wars[i].isFile()) {
1203:                        final String warName = wars[i].getName();
1204:                        final String warNameNC = warName.toLowerCase();
1205:                        if (warNameNC.endsWith(DOTWAR)
1206:                                && !warNameNC.equals(ADMIN + DOTWAR)
1207:                                && !warNameNC.equals(SELFTEST + DOTWAR)) {
1208:                            int dot = warName.indexOf('.');
1209:                            Heritrix.httpServer.addWebapp(warName.substring(0,
1210:                                    dot), null, true);
1211:                        }
1212:                    }
1213:                }
1214:
1215:                // Name of passed 'realm' must match what is in configured in web.xml.
1216:                // We'll use ROLE for 'realm' and 'role'.
1217:                final String ROLE = ADMIN;
1218:                Heritrix.httpServer.setAuthentication(ROLE,
1219:                        Heritrix.adminContext, adminUsername, adminPassword,
1220:                        ROLE);
1221:                Heritrix.httpServer.startServer();
1222:                StringBuffer buffer = new StringBuffer();
1223:                buffer.append("Heritrix " + Heritrix.getVersion()
1224:                        + " is running.");
1225:                for (String host : httpServer.getHosts()) {
1226:                    buffer.append("\nWeb console is at: http://");
1227:                    buffer.append(host).append(':').append(port);
1228:                }
1229:                buffer.append("\nWeb console login and password: "
1230:                        + adminUsername + "/" + adminPassword);
1231:                return buffer.toString();
1232:            }
1233:
1234:            /**
1235:             * Replace existing administrator login info with new info.
1236:             * 
1237:             * @param newUsername new administrator login username
1238:             * @param newPassword new administrator login password
1239:             */
1240:            public static void resetAuthentication(String newUsername,
1241:                    String newPassword) {
1242:                Heritrix.httpServer.resetAuthentication(ADMIN, adminUsername,
1243:                        newUsername, newPassword);
1244:                adminUsername = newUsername;
1245:                adminPassword = newPassword;
1246:                logger.info("administrative login changed to " + newUsername
1247:                        + ":" + newPassword);
1248:            }
1249:
1250:            protected static CrawlJob createCrawlJob(CrawlJobHandler handler,
1251:                    File crawlOrderFile, String name)
1252:                    throws InvalidAttributeValueException {
1253:                XMLSettingsHandler settings = new XMLSettingsHandler(
1254:                        crawlOrderFile);
1255:                settings.initialize();
1256:                return new CrawlJob(handler.getNextJobUID(), name, settings,
1257:                        new CrawlJobErrorHandler(Level.SEVERE),
1258:                        CrawlJob.PRIORITY_HIGH, crawlOrderFile
1259:                                .getAbsoluteFile().getParentFile());
1260:            }
1261:
1262:            /**
1263:             * This method is called when we have an order file to hand that we want
1264:             * to base a job on.  It leaves the order file in place and just starts up
1265:             * a job that uses all the order points to for locations for logs, etc.
1266:             * @param orderPathOrUrl Path to an order file or to a seeds file.
1267:             * @param name Name to use for this job.
1268:             * @param description 
1269:             * @param seeds 
1270:             * @return A status string.
1271:             * @throws IOException 
1272:             * @throws FatalConfigurationException 
1273:             */
1274:            public String addCrawlJob(String orderPathOrUrl, String name,
1275:                    String description, String seeds) throws IOException,
1276:                    FatalConfigurationException {
1277:                if (!UURI.hasScheme(orderPathOrUrl)) {
1278:                    // Assume its a file path.
1279:                    return addCrawlJob(new File(orderPathOrUrl), name,
1280:                            description, seeds);
1281:                }
1282:
1283:                // Otherwise, must be an URL.
1284:                URL url = new URL(orderPathOrUrl);
1285:
1286:                // Handle http and file only for now (Tried to handle JarUrlConnection
1287:                // but too awkward undoing jar stream.  Rather just look for URLs that
1288:                // end in '.jar').
1289:                String result = null;
1290:                URLConnection connection = url.openConnection();
1291:                if (connection instanceof  HttpURLConnection) {
1292:                    result = addCrawlJob(url, (HttpURLConnection) connection,
1293:                            name, description, seeds);
1294:                } else if (connection instanceof  FileURLConnection) {
1295:                    result = addCrawlJob(new File(url.getPath()), name,
1296:                            description, seeds);
1297:                } else {
1298:                    throw new UnsupportedOperationException("No support for "
1299:                            + connection);
1300:                }
1301:
1302:                return result;
1303:            }
1304:
1305:            protected String addCrawlJob(final URL url,
1306:                    final HttpURLConnection connection, final String name,
1307:                    final String description, final String seeds)
1308:                    throws IOException, FatalConfigurationException {
1309:                // Look see if its a jar file.  If it is undo it.
1310:                boolean isJar = url.getPath() != null
1311:                        && url.getPath().toLowerCase().endsWith(JAR_SUFFIX);
1312:                // If http url connection, bring down the resource local.
1313:                File localFile = File.createTempFile(Heritrix.class.getName(),
1314:                        isJar ? JAR_SUFFIX : null, TMPDIR);
1315:                connection.connect();
1316:                String result = null;
1317:                try {
1318:                    IoUtils.readFullyToFile(connection.getInputStream(),
1319:                            localFile);
1320:                    result = addCrawlJob(localFile, name, description, seeds);
1321:                } catch (IOException ioe) {
1322:                    // Cleanup if an Exception.
1323:                    localFile.delete();
1324:                    localFile = null;
1325:                } finally {
1326:                    connection.disconnect();
1327:                    // If its a jar file, then we made a job based on the jar contents.
1328:                    // Its no longer needed.  Remove it.  If not a jar file, then leave
1329:                    // the file around because the job depends on it.
1330:                    if (isJar && localFile != null && localFile.exists()) {
1331:                        localFile.delete();
1332:                    }
1333:                }
1334:                return result;
1335:            }
1336:
1337:            protected String addCrawlJob(final File order, final String name,
1338:                    final String description, final String seeds)
1339:                    throws FatalConfigurationException, IOException {
1340:                CrawlJob addedJob = null;
1341:                if (this .jobHandler == null) {
1342:                    throw new NullPointerException(
1343:                            "Heritrix jobhandler is null.");
1344:                }
1345:                try {
1346:                    if (order.getName().toLowerCase().endsWith(JAR_SUFFIX)) {
1347:                        return addCrawlJobBasedonJar(order, name, description,
1348:                                seeds);
1349:                    }
1350:                    addedJob = this .jobHandler.addJob(createCrawlJob(
1351:                            this .jobHandler, order, name));
1352:                } catch (InvalidAttributeValueException e) {
1353:                    FatalConfigurationException fce = new FatalConfigurationException(
1354:                            "Converted InvalidAttributeValueException on "
1355:                                    + order.getAbsolutePath() + ": "
1356:                                    + e.getMessage());
1357:                    fce.setStackTrace(e.getStackTrace());
1358:                }
1359:                return addedJob != null ? addedJob.getUID() : null;
1360:            }
1361:
1362:            /**
1363:             * Undo jar file and use as basis for a new job.
1364:             * @param jarFile Pointer to file that holds jar.
1365:             * @param name Name to use for new job.
1366:             * @param description 
1367:             * @param seeds 
1368:             * @return Message.
1369:             * @throws IOException
1370:             * @throws FatalConfigurationException
1371:             */
1372:            protected String addCrawlJobBasedonJar(final File jarFile,
1373:                    final String name, final String description,
1374:                    final String seeds) throws IOException,
1375:                    FatalConfigurationException {
1376:                if (jarFile == null || !jarFile.exists()) {
1377:                    throw new FileNotFoundException(jarFile.getAbsolutePath());
1378:                }
1379:                // Create a directory with a tmp name.  Do it by first creating file,
1380:                // removing it, then creating the directory. There is a hole during
1381:                // which the OS may put a file of same exact name in our way but
1382:                // unlikely.
1383:                File dir = File.createTempFile(Heritrix.class.getName(),
1384:                        ".expandedjar", TMPDIR);
1385:                dir.delete();
1386:                dir.mkdir();
1387:                try {
1388:                    org.archive.crawler.util.IoUtils.unzip(jarFile, dir);
1389:                    // Expect to find an order file at least.
1390:                    File orderFile = new File(dir, "order.xml");
1391:                    if (!orderFile.exists()) {
1392:                        throw new IOException("Missing order: "
1393:                                + orderFile.getAbsolutePath());
1394:                    }
1395:                    CrawlJob job = createCrawlJobBasedOn(orderFile, name,
1396:                            description, seeds);
1397:                    // Copy into place any seeds and settings directories before we
1398:                    // add job to Heritrix to crawl.
1399:                    File seedsFile = new File(dir, "seeds.txt");
1400:                    if (seedsFile.exists()) {
1401:                        FileUtils.copyFiles(seedsFile, new File(job
1402:                                .getDirectory(), seedsFile.getName()));
1403:                    }
1404:                    File settingsDir = new File(dir, "settings");
1405:                    if (settingsDir.exists()) {
1406:                        FileUtils.copyFiles(settingsDir, job.getDirectory());
1407:                    }
1408:                    addCrawlJob(job);
1409:                    return job.getUID();
1410:                } finally {
1411:                    // After job has been added, no more need of expanded content.
1412:                    // (Let the caller be responsible for cleanup of jar. Sometimes
1413:                    // its should be deleted -- when its a local copy of a jar pulled
1414:                    // across the net -- wherease other times, if its a jar passed
1415:                    // in w/ a 'file' scheme, it shouldn't be deleted.
1416:                    org.archive.util.FileUtils.deleteDir(dir);
1417:                }
1418:            }
1419:
1420:            public String addCrawlJobBasedOn(String jobUidOrProfile,
1421:                    String name, String description, String seeds) {
1422:                try {
1423:                    CrawlJob cj = getJobHandler().getJob(jobUidOrProfile);
1424:                    if (cj == null) {
1425:                        throw new InvalidAttributeValueException(
1426:                                jobUidOrProfile
1427:                                        + " is not a job UID or profile name (Job UIDs are "
1428:                                        + " usually the 14 digit date portion of job name).");
1429:                    }
1430:                    CrawlJob job = addCrawlJobBasedOn(cj.getSettingsHandler()
1431:                            .getOrderFile(), name, description, seeds);
1432:                    return job.getUID();
1433:                } catch (Exception e) {
1434:                    e.printStackTrace();
1435:                    return "Exception on " + jobUidOrProfile + ": "
1436:                            + e.getMessage();
1437:                }
1438:            }
1439:
1440:            protected CrawlJob addCrawlJobBasedOn(final File orderFile,
1441:                    final String name, final String description,
1442:                    final String seeds) throws FatalConfigurationException {
1443:                return addCrawlJob(createCrawlJobBasedOn(orderFile, name,
1444:                        description, seeds));
1445:            }
1446:
1447:            protected CrawlJob createCrawlJobBasedOn(final File orderFile,
1448:                    final String name, final String description,
1449:                    final String seeds) throws FatalConfigurationException {
1450:                CrawlJob job = getJobHandler().newJob(orderFile, name,
1451:                        description, seeds);
1452:                return CrawlJobHandler.ensureNewJobWritten(job, name,
1453:                        description);
1454:            }
1455:
1456:            protected CrawlJob addCrawlJob(final CrawlJob job) {
1457:                return getJobHandler().addJob(job);
1458:            }
1459:
1460:            public void startCrawling() {
1461:                if (getJobHandler() == null) {
1462:                    throw new NullPointerException(
1463:                            "Heritrix jobhandler is null.");
1464:                }
1465:                getJobHandler().startCrawler();
1466:            }
1467:
1468:            public void stopCrawling() {
1469:                if (getJobHandler() == null) {
1470:                    throw new NullPointerException(
1471:                            "Heritrix jobhandler is null.");
1472:                }
1473:                getJobHandler().stopCrawler();
1474:            }
1475:
1476:            /**
1477:             * Get the heritrix version.
1478:             *
1479:             * @return The heritrix version.  May be null.
1480:             */
1481:            public static String getVersion() {
1482:                return System.getProperty("heritrix.version");
1483:            }
1484:
1485:            /**
1486:             * Get the job handler
1487:             *
1488:             * @return The CrawlJobHandler being used.
1489:             */
1490:            public CrawlJobHandler getJobHandler() {
1491:                return this .jobHandler;
1492:            }
1493:
1494:            /**
1495:             * Get the configuration directory.
1496:             * @return The conf directory under HERITRIX_HOME or null if none can
1497:             * be found.
1498:             * @throws IOException
1499:             */
1500:            public static File getConfdir() throws IOException {
1501:                return getConfdir(true);
1502:            }
1503:
1504:            /**
1505:             * Get the configuration directory.
1506:             * @param fail Throw IOE if can't find directory if true, else just
1507:             * return null.
1508:             * @return The conf directory under HERITRIX_HOME or null (or an IOE) if
1509:             * can't be found.
1510:             * @throws IOException
1511:             */
1512:            public static File getConfdir(final boolean fail)
1513:                    throws IOException {
1514:                final String key = "heritrix.conf";
1515:                // Look to see if heritrix.conf property passed on the cmd-line.
1516:                String tmp = System.getProperty(key);
1517:                // if not fall back to default $HERITIX_HOME/conf
1518:                if (tmp == null || tmp.length() == 0) {
1519:                    return getSubDir("conf", fail);
1520:                }
1521:                File dir = new File(tmp);
1522:                if (!dir.exists()) {
1523:                    if (fail) {
1524:                        throw new IOException("Cannot find conf dir: " + tmp);
1525:                    } else {
1526:                        logger
1527:                                .log(
1528:                                        Level.WARNING,
1529:                                        "Specified "
1530:                                                + key
1531:                                                + " dir does not exist.  Falling back on default");
1532:                    }
1533:                    dir = getSubDir("conf", fail);
1534:                }
1535:                return dir;
1536:            }
1537:
1538:            /**
1539:             * @return Returns the httpServer. May be null if one was not started.
1540:             */
1541:            public static SimpleHttpServer getHttpServer() {
1542:                return Heritrix.httpServer;
1543:            }
1544:
1545:            /**
1546:             * @throws IOException
1547:             * @return Returns the directory under which reside the WAR files
1548:             * we're to load into the servlet container.
1549:             */
1550:            public static File getWarsdir() throws IOException {
1551:                return getSubDir("webapps");
1552:            }
1553:
1554:            /**
1555:             * Prepars for program shutdown. This method does it's best to prepare the
1556:             * program so that it can exit normally. It will kill the httpServer and
1557:             * terminate any running job.<br>
1558:             * It is advisible to wait a few (~1000) millisec after calling this method
1559:             * and before calling performHeritrixShutDown() to allow as many threads as
1560:             * possible to finish what they are doing.
1561:             */
1562:            public static void prepareHeritrixShutDown() {
1563:                // Stop and destroy all running Heritrix instances.
1564:                // Get array of the key set to avoid CCEs for case where call to
1565:                // destroy does a remove of an instance from Heritrix.instances.
1566:                final Object[] keys = Heritrix.instances.keySet().toArray();
1567:                for (int i = 0; i < keys.length; i++) {
1568:                    ((Heritrix) Heritrix.instances.get(keys[i])).destroy();
1569:                }
1570:
1571:                try {
1572:                    deregisterJndi(getJndiContainerName());
1573:                } catch (NameNotFoundException e) {
1574:                    // We were probably unbound already. Ignore.
1575:                    logger.log(Level.WARNING, "deregistration of jndi", e);
1576:                } catch (Exception e) {
1577:                    e.printStackTrace();
1578:                }
1579:
1580:                if (Heritrix.httpServer != null) {
1581:                    // Shut down the web access.
1582:                    try {
1583:                        Heritrix.httpServer.stopServer();
1584:                    } catch (InterruptedException e) {
1585:                        // Generally this can be ignored, but we'll print a stack trace
1586:                        // just in case.
1587:                        e.printStackTrace();
1588:                    } finally {
1589:                        Heritrix.httpServer = null;
1590:                    }
1591:                }
1592:            }
1593:
1594:            /**
1595:             * Exit program. Recommended that prepareHeritrixShutDown() be invoked
1596:             * prior to this method.
1597:             */
1598:            public static void performHeritrixShutDown() {
1599:                performHeritrixShutDown(0);
1600:            }
1601:
1602:            /**
1603:             * Exit program. Recommended that prepareHeritrixShutDown() be invoked
1604:             * prior to this method.
1605:             *
1606:             * @param exitCode Code to pass System.exit.
1607:             *
1608:             */
1609:            public static void performHeritrixShutDown(int exitCode) {
1610:                System.exit(exitCode);
1611:            }
1612:
1613:            /**
1614:             * Shutdown all running heritrix instances and the JVM.
1615:             * Assumes stop has already been called.
1616:             * @param exitCode Exit code to pass system exit.
1617:             */
1618:            public static void shutdown(final int exitCode) {
1619:                getShutdownThread(true, exitCode, "Heritrix shutdown").start();
1620:            }
1621:
1622:            protected static Thread getShutdownThread(final boolean sysexit,
1623:                    final int exitCode, final String name) {
1624:                Thread t = new Thread(name) {
1625:                    public void run() {
1626:                        Heritrix.prepareHeritrixShutDown();
1627:                        if (sysexit) {
1628:                            Heritrix.performHeritrixShutDown(exitCode);
1629:                        }
1630:                    }
1631:                };
1632:                t.setDaemon(true);
1633:                return t;
1634:            }
1635:
1636:            public static void shutdown() {
1637:                shutdown(0);
1638:            }
1639:
1640:            /**
1641:             * Register Heritrix with JNDI, JMX, and with the static hashtable of all
1642:             * Heritrix instances known to this JVM.
1643:             * 
1644:             * If launched from cmdline, register Heritrix MBean if an agent to register
1645:             * ourselves with. Usually this method will only have effect if we're
1646:             * running in a 1.5.0 JDK and command line options such as
1647:             * '-Dcom.sun.management.jmxremote.port=8082
1648:             * -Dcom.sun.management.jmxremote.authenticate=false
1649:             * -Dcom.sun.management.jmxremote.ssl=false' are supplied.
1650:             * See <a href="http://java.sun.com/j2se/1.5.0/docs/guide/management/agent.html">Monitoring
1651:             * and Management Using JMX</a>
1652:             * for more on the command line options and how to connect to the
1653:             * Heritrix bean using the JDK 1.5.0 jconsole tool.  We register currently
1654:             * with first server we find (TODO: Make configurable).
1655:             * 
1656:             * <p>If we register successfully with a JMX agent, then part of the
1657:             * registration will include our registering ourselves with JNDI.
1658:             * 
1659:             * <p>Finally, add the heritrix instance to the hashtable of all the
1660:             * Heritrix instances floating in the current VM.  This latter registeration
1661:             * happens whether or no there is a JMX agent to register with.  This is
1662:             * a list we keep out of convenience so its easy iterating over all
1663:             *  all instances calling stop when main application is going down.
1664:             * 
1665:             * @param h Instance of heritrix to register.
1666:             * @param name Name to use for this Heritrix instance.
1667:             * @param jmxregister True if we are to register this instance with JMX.
1668:             * @throws NullPointerException
1669:             * @throws MalformedObjectNameException
1670:             * @throws NotCompliantMBeanException 
1671:             * @throws MBeanRegistrationException 
1672:             * @throws InstanceAlreadyExistsException 
1673:             */
1674:            protected static void registerHeritrix(final Heritrix h,
1675:                    final String name, final boolean jmxregister)
1676:                    throws MalformedObjectNameException,
1677:                    InstanceAlreadyExistsException, MBeanRegistrationException,
1678:                    NotCompliantMBeanException {
1679:                MBeanServer server = getMBeanServer();
1680:                if (server != null) {
1681:                    // Are we to manage the jmx registration?  Or is it being done for
1682:                    // us by an external process: e.g. This instance was created by
1683:                    // MBeanAgent.
1684:                    if (jmxregister) {
1685:                        ObjectName objName = (name == null || name.length() <= 0) ? getJmxObjectName()
1686:                                : getJmxObjectName(name);
1687:                        registerMBean(server, h, objName);
1688:                    }
1689:                } else {
1690:                    // JMX ain't available. Put this instance into the list of Heritrix
1691:                    // instances so findable by the UI (Normally this is done in the
1692:                    // JMX postRegister routine below).  When no JMX, can only have
1693:                    // one instance of Heritrix so no need to do the deregisteration.
1694:                    Heritrix.instances.put(h.getNoJmxName(), h);
1695:                }
1696:            }
1697:
1698:            protected static void unregisterHeritrix(final Heritrix h)
1699:                    throws InstanceNotFoundException,
1700:                    MBeanRegistrationException, NullPointerException {
1701:                MBeanServer server = getMBeanServer();
1702:                if (server != null) {
1703:                    server.unregisterMBean(h.mbeanName);
1704:                } else {
1705:                    // JMX ain't available. Remove from list of Heritrix instances.
1706:                    // Usually this is done by the JMX postDeregister below.
1707:                    Heritrix.instances.remove(h.getNoJmxName());
1708:                }
1709:            }
1710:
1711:            /**
1712:             * Get MBeanServer.
1713:             * Currently uses first MBeanServer found.  This will definetly not be whats
1714:             * always wanted. TODO: Make which server settable. Also, if none, put up
1715:             * our own MBeanServer.
1716:             * @return An MBeanServer to register with or null.
1717:             */
1718:            public static MBeanServer getMBeanServer() {
1719:                MBeanServer result = null;
1720:                List servers = MBeanServerFactory.findMBeanServer(null);
1721:                if (servers == null) {
1722:                    return result;
1723:                }
1724:                for (Iterator i = servers.iterator(); i.hasNext();) {
1725:                    MBeanServer server = (MBeanServer) i.next();
1726:                    if (server == null) {
1727:                        continue;
1728:                    }
1729:                    result = server;
1730:                    break;
1731:                }
1732:                return result;
1733:            }
1734:
1735:            public static MBeanServer registerMBean(final Object objToRegister,
1736:                    final String name, final String type)
1737:                    throws InstanceAlreadyExistsException,
1738:                    MBeanRegistrationException, NotCompliantMBeanException {
1739:                MBeanServer server = getMBeanServer();
1740:                if (server != null) {
1741:                    server = registerMBean(server, objToRegister, name, type);
1742:                }
1743:                return server;
1744:            }
1745:
1746:            public static MBeanServer registerMBean(final MBeanServer server,
1747:                    final Object objToRegister, final String name,
1748:                    final String type) throws InstanceAlreadyExistsException,
1749:                    MBeanRegistrationException, NotCompliantMBeanException {
1750:                try {
1751:                    Hashtable<String, String> ht = new Hashtable<String, String>();
1752:                    ht.put(JmxUtils.NAME, name);
1753:                    ht.put(JmxUtils.TYPE, type);
1754:                    registerMBean(server, objToRegister, new ObjectName(
1755:                            CRAWLER_PACKAGE, ht));
1756:                } catch (MalformedObjectNameException e) {
1757:                    e.printStackTrace();
1758:                }
1759:                return server;
1760:            }
1761:
1762:            public static MBeanServer registerMBean(final MBeanServer server,
1763:                    final Object objToRegister, final ObjectName objName)
1764:                    throws InstanceAlreadyExistsException,
1765:                    MBeanRegistrationException, NotCompliantMBeanException {
1766:                server.registerMBean(objToRegister, objName);
1767:                return server;
1768:            }
1769:
1770:            public static void unregisterMBean(final MBeanServer server,
1771:                    final String name, final String type) {
1772:                if (server == null) {
1773:                    return;
1774:                }
1775:                try {
1776:                    unregisterMBean(server, getJmxObjectName(name, type));
1777:                } catch (MalformedObjectNameException e) {
1778:                    e.printStackTrace();
1779:                }
1780:            }
1781:
1782:            public static void unregisterMBean(final MBeanServer server,
1783:                    final ObjectName name) {
1784:                try {
1785:                    server.unregisterMBean(name);
1786:                    logger.info("Unregistered bean " + name.getCanonicalName());
1787:                } catch (InstanceNotFoundException e) {
1788:                    e.printStackTrace();
1789:                } catch (MBeanRegistrationException e) {
1790:                    e.printStackTrace();
1791:                } catch (NullPointerException e) {
1792:                    e.printStackTrace();
1793:                }
1794:            }
1795:
1796:            /**
1797:             * @return Name to use when no JMX agent available.
1798:             */
1799:            protected String getNoJmxName() {
1800:                return this .getClass().getName();
1801:            }
1802:
1803:            public static ObjectName getJmxObjectName()
1804:                    throws MalformedObjectNameException, NullPointerException {
1805:                return getJmxObjectName("Heritrix", JmxUtils.SERVICE);
1806:            }
1807:
1808:            public static ObjectName getJmxObjectName(final String name)
1809:                    throws MalformedObjectNameException, NullPointerException {
1810:                return getJmxObjectName(name, JmxUtils.SERVICE);
1811:            }
1812:
1813:            public static ObjectName getJmxObjectName(final String name,
1814:                    final String type) throws MalformedObjectNameException,
1815:                    NullPointerException {
1816:                Hashtable<String, String> ht = new Hashtable<String, String>();
1817:                ht.put(JmxUtils.NAME, name);
1818:                ht.put(JmxUtils.TYPE, type);
1819:                return new ObjectName(CRAWLER_PACKAGE, ht);
1820:            }
1821:
1822:            /**
1823:             * @return Returns true if Heritrix was launched from the command line.
1824:             * (When launched from command line, we do stuff like put up a web server
1825:             * to manage our web interface and we register ourselves with the first
1826:             * available jmx agent).
1827:             */
1828:            public static boolean isCommandLine() {
1829:                return Heritrix.commandLine;
1830:            }
1831:
1832:            /**
1833:             * @return True if heritrix has been started.
1834:             */
1835:            public boolean isStarted() {
1836:                return this .jobHandler != null;
1837:            }
1838:
1839:            public String getStatus() {
1840:                StringBuffer buffer = new StringBuffer();
1841:                if (this .getJobHandler() != null) {
1842:                    buffer.append("isRunning=");
1843:                    buffer.append(this .getJobHandler().isRunning());
1844:                    buffer.append(" isCrawling=");
1845:                    buffer.append(this .getJobHandler().isCrawling());
1846:                    buffer.append(" alertCount=");
1847:                    buffer.append(getAlertsCount());
1848:                    buffer.append(" newAlertCount=");
1849:                    buffer.append(getNewAlertsCount());
1850:                    if (this .getJobHandler().isCrawling()) {
1851:                        buffer.append(" currentJob=");
1852:                        buffer.append(this .getJobHandler().getCurrentJob()
1853:                                .getJmxJobName());
1854:                    }
1855:                }
1856:                return buffer.toString();
1857:            }
1858:
1859:            // Alert methods.
1860:            public int getAlertsCount() {
1861:                return this .alertManager.getCount();
1862:            }
1863:
1864:            public int getNewAlertsCount() {
1865:                return this .alertManager.getNewCount();
1866:            }
1867:
1868:            public Vector getAlerts() {
1869:                return this .alertManager.getAll();
1870:            }
1871:
1872:            public Vector getNewAlerts() {
1873:                return this .alertManager.getNewAll();
1874:            }
1875:
1876:            public SinkHandlerLogRecord getAlert(final String id) {
1877:                return this .alertManager.get(id);
1878:            }
1879:
1880:            public void readAlert(final String id) {
1881:                this .alertManager.read(id);
1882:            }
1883:
1884:            public void removeAlert(final String id) {
1885:                this .alertManager.remove(id);
1886:            }
1887:
1888:            /**
1889:             * Start Heritrix.
1890:             * 
1891:             * Used by JMX and webapp initialization for starting Heritrix.
1892:             * Not by the cmdline launched Heritrix. Idempotent.
1893:             * If start is called by JMX, then new instance of Heritrix is automatically
1894:             * registered w/ JMX Agent.  If started by webapp, need to register the new
1895:             * Heritrix instance.
1896:             */
1897:            public void start() {
1898:                // Don't start if we've been launched from the command line.
1899:                // Don't start if already started.
1900:                if (!Heritrix.isCommandLine() && !isStarted()) {
1901:                    try {
1902:                        logger.info(launch());
1903:                    } catch (Exception e) {
1904:                        e.printStackTrace();
1905:                    }
1906:                }
1907:            }
1908:
1909:            /**
1910:             * Stop Heritrix.
1911:             * 
1912:             * Used by JMX and webapp initialization for stopping Heritrix.
1913:             */
1914:            public void stop() {
1915:                if (this .jobHandler != null) {
1916:                    this .jobHandler.stop();
1917:                }
1918:            }
1919:
1920:            public String interrupt(String threadName) {
1921:                String result = "Thread " + threadName + " not found";
1922:                ThreadGroup group = Thread.currentThread().getThreadGroup();
1923:                if (group == null) {
1924:                    return result;
1925:                }
1926:                // Back up to the root threadgroup before starting
1927:                // to iterate over threads.
1928:                ThreadGroup parent = null;
1929:                while ((parent = group.getParent()) != null) {
1930:                    group = parent;
1931:                }
1932:                // Do an array that is twice the size of active
1933:                // thread count.  That should be big enough.
1934:                final int max = group.activeCount() * 2;
1935:                Thread[] threads = new Thread[max];
1936:                int threadCount = group.enumerate(threads, true);
1937:                if (threadCount >= max) {
1938:                    logger.info("Some threads not found...array too small: "
1939:                            + max);
1940:                }
1941:                for (int j = 0; j < threadCount; j++) {
1942:                    if (threads[j].getName().equals(threadName)) {
1943:                        threads[j].interrupt();
1944:                        result = "Interrupt sent to " + threadName;
1945:                        break;
1946:                    }
1947:                }
1948:                return result;
1949:            }
1950:
1951:            // OpenMBean implementation.
1952:
1953:            /**
1954:             * Build up the MBean info for Heritrix main.
1955:             * @return Return created mbean info instance.
1956:             */
1957:            protected OpenMBeanInfoSupport buildMBeanInfo() {
1958:                OpenMBeanAttributeInfoSupport[] attributes = new OpenMBeanAttributeInfoSupport[Heritrix.ATTRIBUTE_LIST
1959:                        .size()];
1960:                OpenMBeanConstructorInfoSupport[] constructors = new OpenMBeanConstructorInfoSupport[1];
1961:                OpenMBeanOperationInfoSupport[] operations = new OpenMBeanOperationInfoSupport[Heritrix.OPERATION_LIST
1962:                        .size()];
1963:                MBeanNotificationInfo[] notifications = new MBeanNotificationInfo[0];
1964:
1965:                // Attributes.
1966:                attributes[0] = new OpenMBeanAttributeInfoSupport(
1967:                        Heritrix.STATUS_ATTR, "Short basic status message",
1968:                        SimpleType.STRING, true, false, false);
1969:                // Attributes.
1970:                attributes[1] = new OpenMBeanAttributeInfoSupport(
1971:                        Heritrix.VERSION_ATTR, "Heritrix version",
1972:                        SimpleType.STRING, true, false, false);
1973:                // Attributes.
1974:                attributes[2] = new OpenMBeanAttributeInfoSupport(
1975:                        Heritrix.ISRUNNING_ATTR,
1976:                        "Whether the crawler is running", SimpleType.BOOLEAN,
1977:                        true, false, false);
1978:                // Attributes.
1979:                attributes[3] = new OpenMBeanAttributeInfoSupport(
1980:                        Heritrix.ISCRAWLING_ATTR,
1981:                        "Whether the crawler is crawling", SimpleType.BOOLEAN,
1982:                        true, false, false);
1983:                // Attributes.
1984:                attributes[4] = new OpenMBeanAttributeInfoSupport(
1985:                        Heritrix.ALERTCOUNT_ATTR, "The number of alerts",
1986:                        SimpleType.INTEGER, true, false, false);
1987:                // Attributes.
1988:                attributes[5] = new OpenMBeanAttributeInfoSupport(
1989:                        Heritrix.NEWALERTCOUNT_ATTR,
1990:                        "The number of new alerts", SimpleType.INTEGER, true,
1991:                        false, false);
1992:                // Attributes.
1993:                attributes[6] = new OpenMBeanAttributeInfoSupport(
1994:                        Heritrix.CURRENTJOB_ATTR,
1995:                        "The name of the job currently being crawled",
1996:                        SimpleType.STRING, true, false, false);
1997:
1998:                // Constructors.
1999:                constructors[0] = new OpenMBeanConstructorInfoSupport(
2000:                        "HeritrixOpenMBean",
2001:                        "Constructs Heritrix OpenMBean instance ",
2002:                        new OpenMBeanParameterInfoSupport[0]);
2003:
2004:                // Operations.
2005:                operations[0] = new OpenMBeanOperationInfoSupport(
2006:                        Heritrix.START_OPER, "Start Heritrix instance", null,
2007:                        SimpleType.VOID, MBeanOperationInfo.ACTION);
2008:
2009:                operations[1] = new OpenMBeanOperationInfoSupport(
2010:                        Heritrix.STOP_OPER, "Stop Heritrix instance", null,
2011:                        SimpleType.VOID, MBeanOperationInfo.ACTION);
2012:
2013:                OpenMBeanParameterInfo[] args = new OpenMBeanParameterInfoSupport[1];
2014:                args[0] = new OpenMBeanParameterInfoSupport("threadName",
2015:                        "Name of thread to send interrupt", SimpleType.STRING);
2016:                operations[2] = new OpenMBeanOperationInfoSupport(
2017:                        Heritrix.INTERRUPT_OPER, "Send thread an interrupt "
2018:                                + "(Used debugging)", args, SimpleType.STRING,
2019:                        MBeanOperationInfo.ACTION_INFO);
2020:
2021:                operations[3] = new OpenMBeanOperationInfoSupport(
2022:                        Heritrix.START_CRAWLING_OPER, "Set Heritrix instance "
2023:                                + "into crawling mode", null, SimpleType.VOID,
2024:                        MBeanOperationInfo.ACTION);
2025:
2026:                operations[4] = new OpenMBeanOperationInfoSupport(
2027:                        Heritrix.STOP_CRAWLING_OPER, "Unset Heritrix instance "
2028:                                + " crawling mode", null, SimpleType.VOID,
2029:                        MBeanOperationInfo.ACTION);
2030:
2031:                args = new OpenMBeanParameterInfoSupport[4];
2032:                args[0] = new OpenMBeanParameterInfoSupport("pathOrURL",
2033:                        "Path/URL to order or jar of order+seed",
2034:                        SimpleType.STRING);
2035:                args[1] = new OpenMBeanParameterInfoSupport("name",
2036:                        "Basename for new job", SimpleType.STRING);
2037:                args[2] = new OpenMBeanParameterInfoSupport("description",
2038:                        "Description to save with new job", SimpleType.STRING);
2039:                args[3] = new OpenMBeanParameterInfoSupport("seeds",
2040:                        "Initial seed(s)", SimpleType.STRING);
2041:                operations[5] = new OpenMBeanOperationInfoSupport(
2042:                        Heritrix.ADD_CRAWL_JOB_OPER, "Add new crawl job", args,
2043:                        SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2044:
2045:                args = new OpenMBeanParameterInfoSupport[4];
2046:                args[0] = new OpenMBeanParameterInfoSupport("uidOrName",
2047:                        "Job UID or profile name", SimpleType.STRING);
2048:                args[1] = new OpenMBeanParameterInfoSupport("name",
2049:                        "Basename for new job", SimpleType.STRING);
2050:                args[2] = new OpenMBeanParameterInfoSupport("description",
2051:                        "Description to save with new job", SimpleType.STRING);
2052:                args[3] = new OpenMBeanParameterInfoSupport("seeds",
2053:                        "Initial seed(s)", SimpleType.STRING);
2054:                operations[6] = new OpenMBeanOperationInfoSupport(
2055:                        Heritrix.ADD_CRAWL_JOB_BASEDON_OPER,
2056:                        "Add a new crawl job based on passed Job UID or profile",
2057:                        args, SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2058:
2059:                args = new OpenMBeanParameterInfoSupport[1];
2060:                args[0] = new OpenMBeanParameterInfoSupport("UID", "Job UID",
2061:                        SimpleType.STRING);
2062:                operations[7] = new OpenMBeanOperationInfoSupport(
2063:                        DELETE_CRAWL_JOB_OPER, "Delete/stop this crawl job",
2064:                        args, SimpleType.VOID, MBeanOperationInfo.ACTION);
2065:
2066:                args = new OpenMBeanParameterInfoSupport[1];
2067:                args[0] = new OpenMBeanParameterInfoSupport("index",
2068:                        "Zero-based index into array of alerts",
2069:                        SimpleType.INTEGER);
2070:                operations[8] = new OpenMBeanOperationInfoSupport(
2071:                        Heritrix.ALERT_OPER, "Return alert at passed index",
2072:                        args, SimpleType.STRING, MBeanOperationInfo.ACTION_INFO);
2073:
2074:                try {
2075:                    this .jobCompositeType = new CompositeType("job",
2076:                            "Job attributes", JOB_KEYS,
2077:                            new String[] { "Job unique ID", "Job name",
2078:                                    "Job status" }, new OpenType[] {
2079:                                    SimpleType.STRING, SimpleType.STRING,
2080:                                    SimpleType.STRING });
2081:                    this .jobsTabularType = new TabularType("jobs",
2082:                            "List of jobs", this .jobCompositeType,
2083:                            new String[] { "uid" });
2084:                } catch (OpenDataException e) {
2085:                    // This should never happen.
2086:                    throw new RuntimeException(e);
2087:                }
2088:                operations[9] = new OpenMBeanOperationInfoSupport(
2089:                        Heritrix.PENDING_JOBS_OPER,
2090:                        "List of pending jobs (or null if none)", null,
2091:                        this .jobsTabularType, MBeanOperationInfo.INFO);
2092:                operations[10] = new OpenMBeanOperationInfoSupport(
2093:                        Heritrix.COMPLETED_JOBS_OPER,
2094:                        "List of completed jobs (or null if none)", null,
2095:                        this .jobsTabularType, MBeanOperationInfo.INFO);
2096:
2097:                args = new OpenMBeanParameterInfoSupport[2];
2098:                args[0] = new OpenMBeanParameterInfoSupport("uid",
2099:                        "Job unique ID", SimpleType.STRING);
2100:                args[1] = new OpenMBeanParameterInfoSupport("name",
2101:                        "Report name (e.g. crawl-report, etc.)",
2102:                        SimpleType.STRING);
2103:                operations[11] = new OpenMBeanOperationInfoSupport(
2104:                        Heritrix.CRAWLEND_REPORT_OPER,
2105:                        "Return crawl-end report", args, SimpleType.STRING,
2106:                        MBeanOperationInfo.ACTION_INFO);
2107:
2108:                operations[12] = new OpenMBeanOperationInfoSupport(
2109:                        Heritrix.SHUTDOWN_OPER, "Shutdown container", null,
2110:                        SimpleType.VOID, MBeanOperationInfo.ACTION);
2111:
2112:                args = new OpenMBeanParameterInfoSupport[2];
2113:                args[0] = new OpenMBeanParameterInfoSupport("level",
2114:                        "Log level: e.g. SEVERE, WARNING, etc.",
2115:                        SimpleType.STRING);
2116:                args[1] = new OpenMBeanParameterInfoSupport("message",
2117:                        "Log message", SimpleType.STRING);
2118:                operations[13] = new OpenMBeanOperationInfoSupport(
2119:                        Heritrix.LOG_OPER, "Add a log message", args,
2120:                        SimpleType.VOID, MBeanOperationInfo.ACTION);
2121:
2122:                operations[14] = new OpenMBeanOperationInfoSupport(
2123:                        Heritrix.DESTROY_OPER, "Destroy Heritrix instance",
2124:                        null, SimpleType.VOID, MBeanOperationInfo.ACTION);
2125:
2126:                operations[15] = new OpenMBeanOperationInfoSupport(
2127:                        Heritrix.TERMINATE_CRAWL_JOB_OPER,
2128:                        "Returns false if no current job", null,
2129:                        SimpleType.BOOLEAN, MBeanOperationInfo.ACTION);
2130:
2131:                operations[16] = new OpenMBeanOperationInfoSupport(
2132:                        Heritrix.REBIND_JNDI_OPER,
2133:                        "Rebinds this Heritrix with JNDI.", null,
2134:                        SimpleType.VOID, MBeanOperationInfo.ACTION);
2135:
2136:                // Build the info object.
2137:                return new OpenMBeanInfoSupport(this .getClass().getName(),
2138:                        "Heritrix Main OpenMBean", attributes, constructors,
2139:                        operations, notifications);
2140:            }
2141:
2142:            public Object getAttribute(String attribute_name)
2143:                    throws AttributeNotFoundException {
2144:                if (attribute_name == null) {
2145:                    throw new RuntimeOperationsException(
2146:                            new IllegalArgumentException(
2147:                                    "Attribute name cannot be null"),
2148:                            "Cannot call getAttribute with null attribute name");
2149:                }
2150:                if (!Heritrix.ATTRIBUTE_LIST.contains(attribute_name)) {
2151:                    throw new AttributeNotFoundException("Attribute "
2152:                            + attribute_name + " is unimplemented.");
2153:                }
2154:                // The pattern in the below is to match an attribute and when found
2155:                // do a return out of if clause.  Doing it this way, I can fall
2156:                // on to the AttributeNotFoundException for case where we've an
2157:                // attribute but no handler.
2158:                if (attribute_name.equals(STATUS_ATTR)) {
2159:                    return getStatus();
2160:                }
2161:                if (attribute_name.equals(VERSION_ATTR)) {
2162:                    return getVersion();
2163:                }
2164:
2165:                if (attribute_name.equals(ISRUNNING_ATTR)) {
2166:                    return new Boolean(this .getJobHandler().isRunning());
2167:                }
2168:                if (attribute_name.equals(ISCRAWLING_ATTR)) {
2169:                    return new Boolean(this .getJobHandler().isCrawling());
2170:                }
2171:                if (attribute_name.equals(ALERTCOUNT_ATTR)) {
2172:                    return new Integer(getAlertsCount());
2173:                }
2174:                if (attribute_name.equals(NEWALERTCOUNT_ATTR)) {
2175:                    return new Integer(getNewAlertsCount());
2176:                }
2177:                if (attribute_name.equals(CURRENTJOB_ATTR)) {
2178:                    if (this .getJobHandler().isCrawling()) {
2179:                        return this .getJobHandler().getCurrentJob()
2180:                                .getJmxJobName();
2181:                    }
2182:                    return null;
2183:                }
2184:                throw new AttributeNotFoundException("Attribute "
2185:                        + attribute_name + " not found.");
2186:            }
2187:
2188:            public void setAttribute(Attribute attribute)
2189:                    throws AttributeNotFoundException {
2190:                throw new AttributeNotFoundException(
2191:                        "No attribute can be set in " + "this MBean");
2192:            }
2193:
2194:            public AttributeList getAttributes(String[] attributeNames) {
2195:                if (attributeNames == null) {
2196:                    throw new RuntimeOperationsException(
2197:                            new IllegalArgumentException(
2198:                                    "attributeNames[] cannot be " + "null"),
2199:                            "Cannot call getAttributes with null attribute "
2200:                                    + "names");
2201:                }
2202:                AttributeList resultList = new AttributeList();
2203:                if (attributeNames.length == 0) {
2204:                    return resultList;
2205:                }
2206:                for (int i = 0; i < attributeNames.length; i++) {
2207:                    try {
2208:                        Object value = getAttribute(attributeNames[i]);
2209:                        resultList.add(new Attribute(attributeNames[i], value));
2210:                    } catch (Exception e) {
2211:                        e.printStackTrace();
2212:                    }
2213:                }
2214:                return (resultList);
2215:            }
2216:
2217:            public AttributeList setAttributes(AttributeList attributes) {
2218:                return new AttributeList(); // always empty
2219:            }
2220:
2221:            public Object invoke(final String operationName,
2222:                    final Object[] params, final String[] signature)
2223:                    throws ReflectionException {
2224:                if (operationName == null) {
2225:                    throw new RuntimeOperationsException(
2226:                            new IllegalArgumentException(
2227:                                    "Operation name cannot be null"),
2228:                            "Cannot call invoke with null operation name");
2229:                }
2230:                // The pattern in the below is to match an operation and when found
2231:                // do a return out of if clause.  Doing it this way, I can fall
2232:                // on to the MethodNotFoundException for case where we've an
2233:                // attribute but no handler.
2234:                if (operationName.equals(START_OPER)) {
2235:                    JmxUtils.checkParamsCount(START_OPER, params, 0);
2236:                    start();
2237:                    return null;
2238:                }
2239:                if (operationName.equals(STOP_OPER)) {
2240:                    JmxUtils.checkParamsCount(STOP_OPER, params, 0);
2241:                    stop();
2242:                    return null;
2243:                }
2244:                if (operationName.equals(DESTROY_OPER)) {
2245:                    JmxUtils.checkParamsCount(DESTROY_OPER, params, 0);
2246:                    destroy();
2247:                    return null;
2248:                }
2249:                if (operationName.equals(TERMINATE_CRAWL_JOB_OPER)) {
2250:                    JmxUtils.checkParamsCount(TERMINATE_CRAWL_JOB_OPER, params,
2251:                            0);
2252:                    return new Boolean(this .jobHandler.terminateCurrentJob());
2253:                }
2254:                if (operationName.equals(REBIND_JNDI_OPER)) {
2255:                    JmxUtils.checkParamsCount(REBIND_JNDI_OPER, params, 0);
2256:                    try {
2257:                        registerContainerJndi();
2258:                    } catch (MalformedObjectNameException e) {
2259:                        throw new RuntimeOperationsException(
2260:                                new RuntimeException(e));
2261:                    } catch (UnknownHostException e) {
2262:                        throw new RuntimeOperationsException(
2263:                                new RuntimeException(e));
2264:                    } catch (NamingException e) {
2265:                        throw new RuntimeOperationsException(
2266:                                new RuntimeException(e));
2267:                    }
2268:                    return null;
2269:                }
2270:                if (operationName.equals(SHUTDOWN_OPER)) {
2271:                    JmxUtils.checkParamsCount(SHUTDOWN_OPER, params, 0);
2272:                    Heritrix.shutdown();
2273:                    return null;
2274:                }
2275:                if (operationName.equals(LOG_OPER)) {
2276:                    JmxUtils.checkParamsCount(LOG_OPER, params, 2);
2277:                    logger.log(Level.parse((String) params[0]),
2278:                            (String) params[1]);
2279:                    return null;
2280:                }
2281:                if (operationName.equals(INTERRUPT_OPER)) {
2282:                    JmxUtils.checkParamsCount(INTERRUPT_OPER, params, 1);
2283:                    return interrupt((String) params[0]);
2284:                }
2285:                if (operationName.equals(START_CRAWLING_OPER)) {
2286:                    JmxUtils.checkParamsCount(START_CRAWLING_OPER, params, 0);
2287:                    startCrawling();
2288:                    return null;
2289:                }
2290:                if (operationName.equals(STOP_CRAWLING_OPER)) {
2291:                    JmxUtils.checkParamsCount(STOP_CRAWLING_OPER, params, 0);
2292:                    stopCrawling();
2293:                    return null;
2294:                }
2295:                if (operationName.equals(ADD_CRAWL_JOB_OPER)) {
2296:                    JmxUtils.checkParamsCount(ADD_CRAWL_JOB_OPER, params, 4);
2297:                    try {
2298:                        return addCrawlJob((String) params[0],
2299:                                (String) params[1],
2300:                                checkForEmptyPlaceHolder((String) params[2]),
2301:                                checkForEmptyPlaceHolder((String) params[3]));
2302:                    } catch (IOException e) {
2303:                        throw new RuntimeOperationsException(
2304:                                new RuntimeException(e));
2305:                    } catch (FatalConfigurationException e) {
2306:                        throw new RuntimeOperationsException(
2307:                                new RuntimeException(e));
2308:                    }
2309:                }
2310:                if (operationName.equals(DELETE_CRAWL_JOB_OPER)) {
2311:                    JmxUtils.checkParamsCount(DELETE_CRAWL_JOB_OPER, params, 1);
2312:                    this .jobHandler.deleteJob((String) params[0]);
2313:                    return null;
2314:                }
2315:
2316:                if (operationName.equals(ADD_CRAWL_JOB_BASEDON_OPER)) {
2317:                    JmxUtils.checkParamsCount(ADD_CRAWL_JOB_BASEDON_OPER,
2318:                            params, 4);
2319:                    return addCrawlJobBasedOn((String) params[0],
2320:                            (String) params[1],
2321:                            checkForEmptyPlaceHolder((String) params[2]),
2322:                            checkForEmptyPlaceHolder((String) params[3]));
2323:                }
2324:                if (operationName.equals(ALERT_OPER)) {
2325:                    JmxUtils.checkParamsCount(ALERT_OPER, params, 1);
2326:                    SinkHandlerLogRecord slr = null;
2327:                    if (this .alertManager.getCount() > 0) {
2328:                        // This is creating a vector of all alerts just so I can then
2329:                        // use passed index into resultant vector -- needs to be
2330:                        // improved.
2331:                        slr = (SinkHandlerLogRecord) this .alertManager.getAll()
2332:                                .get(((Integer) params[0]).intValue());
2333:                    }
2334:                    return (slr != null) ? slr.toString() : null;
2335:                }
2336:
2337:                if (operationName.equals(PENDING_JOBS_OPER)) {
2338:                    JmxUtils.checkParamsCount(PENDING_JOBS_OPER, params, 0);
2339:                    try {
2340:                        return makeJobsTabularData(getJobHandler()
2341:                                .getPendingJobs());
2342:                    } catch (OpenDataException e) {
2343:                        throw new RuntimeOperationsException(
2344:                                new RuntimeException(e));
2345:                    }
2346:                }
2347:
2348:                if (operationName.equals(COMPLETED_JOBS_OPER)) {
2349:                    JmxUtils.checkParamsCount(COMPLETED_JOBS_OPER, params, 0);
2350:                    try {
2351:                        return makeJobsTabularData(getJobHandler()
2352:                                .getCompletedJobs());
2353:                    } catch (OpenDataException e) {
2354:                        throw new RuntimeOperationsException(
2355:                                new RuntimeException(e));
2356:                    }
2357:                }
2358:
2359:                if (operationName.equals(CRAWLEND_REPORT_OPER)) {
2360:                    JmxUtils.checkParamsCount(CRAWLEND_REPORT_OPER, params, 2);
2361:                    try {
2362:                        return getCrawlendReport((String) params[0],
2363:                                (String) params[1]);
2364:                    } catch (IOException e) {
2365:                        throw new RuntimeOperationsException(
2366:                                new RuntimeException(e));
2367:                    }
2368:                }
2369:
2370:                throw new ReflectionException(new NoSuchMethodException(
2371:                        operationName), "Cannot find the operation "
2372:                        + operationName);
2373:            }
2374:
2375:            /**
2376:             * Return named crawl end report for job with passed uid.
2377:             * Crawler makes reports when its finished its crawl.  Use this method
2378:             * to get a String version of one of these files.
2379:             * @param jobUid The unique ID for the job whose reports you want to see
2380:             * (Must be a completed job).
2381:             * @param reportName Name of report minus '.txt' (e.g. crawl-report).
2382:             * @return String version of the on-disk report.
2383:             * @throws IOException 
2384:             */
2385:            protected String getCrawlendReport(String jobUid, String reportName)
2386:                    throws IOException {
2387:                CrawlJob job = getJobHandler().getJob(jobUid);
2388:                if (job == null) {
2389:                    throw new IOException("No such job: " + jobUid);
2390:                }
2391:                File report = new File(job.getDirectory(), reportName + ".txt");
2392:                if (!report.exists()) {
2393:                    throw new FileNotFoundException(report.getAbsolutePath());
2394:                }
2395:                return FileUtils.readFileAsString(report);
2396:            }
2397:
2398:            protected TabularData makeJobsTabularData(List jobs)
2399:                    throws OpenDataException {
2400:                if (jobs == null || jobs.size() == 0) {
2401:                    return null;
2402:                }
2403:                TabularData td = new TabularDataSupport(this .jobsTabularType);
2404:                for (Iterator i = jobs.iterator(); i.hasNext();) {
2405:                    CrawlJob job = (CrawlJob) i.next();
2406:                    CompositeData cd = new CompositeDataSupport(
2407:                            this .jobCompositeType, JOB_KEYS, new String[] {
2408:                                    job.getUID(), job.getJobName(),
2409:                                    job.getStatus() });
2410:                    td.put(cd);
2411:                }
2412:                return td;
2413:            }
2414:
2415:            /**
2416:             * If passed str has placeholder for the empty string, return the empty
2417:             * string else return orginal.
2418:             * Dumb jmx clients can't pass empty string so they'll pass a representation
2419:             * of empty string such as ' ' or '-'.  Convert such strings to empty
2420:             * string.
2421:             * @param str String to check.
2422:             * @return Original <code>str</code> or empty string if <code>str</code>
2423:             * contains a placeholder for the empty-string (e.g. '-', or ' ').
2424:             */
2425:            protected String checkForEmptyPlaceHolder(String str) {
2426:                return TextUtils.matches("-| +", str) ? "" : str;
2427:            }
2428:
2429:            public MBeanInfo getMBeanInfo() {
2430:                return this .openMBeanInfo;
2431:            }
2432:
2433:            /**
2434:             * @return Name this instance registered in JMX (Only available after JMX
2435:             * registration).
2436:             */
2437:            public ObjectName getMBeanName() {
2438:                return this .mbeanName;
2439:            }
2440:
2441:            public ObjectName preRegister(MBeanServer server, ObjectName name)
2442:                    throws Exception {
2443:                this .mbeanServer = server;
2444:                @SuppressWarnings("unchecked")
2445:                Hashtable<String, String> ht = name.getKeyPropertyList();
2446:                if (!ht.containsKey(JmxUtils.NAME)) {
2447:                    throw new IllegalArgumentException("Name property required"
2448:                            + name.getCanonicalName());
2449:                }
2450:                if (!ht.containsKey(JmxUtils.TYPE)) {
2451:                    ht.put(JmxUtils.TYPE, JmxUtils.SERVICE);
2452:                    name = new ObjectName(name.getDomain(), ht);
2453:                }
2454:                this .mbeanName = addGuiPort(addVitals(name));
2455:                Heritrix.instances.put(this .mbeanName
2456:                        .getCanonicalKeyPropertyListString(), this );
2457:                return this .mbeanName;
2458:            }
2459:
2460:            /**
2461:             * Add vital stats to passed in ObjectName.
2462:             * @param name ObjectName to add to.
2463:             * @return name with host, guiport, and jmxport added.
2464:             * @throws UnknownHostException
2465:             * @throws MalformedObjectNameException
2466:             * @throws NullPointerException
2467:             */
2468:            protected static ObjectName addVitals(ObjectName name)
2469:                    throws UnknownHostException, MalformedObjectNameException,
2470:                    NullPointerException {
2471:                @SuppressWarnings("unchecked")
2472:                Hashtable<String, String> ht = name.getKeyPropertyList();
2473:                if (!ht.containsKey(JmxUtils.HOST)) {
2474:                    ht.put(JmxUtils.HOST, InetAddress.getLocalHost()
2475:                            .getHostName());
2476:                    name = new ObjectName(name.getDomain(), ht);
2477:                }
2478:                if (!ht.containsKey(JmxUtils.JMX_PORT)) {
2479:                    // Add jdk jmx-port. This will be present if we've attached
2480:                    // ourselves to the jdk jmx agent.  Otherwise, we've been
2481:                    // deployed in a j2ee container with its own jmx agent.  In
2482:                    // this case we won't know how to get jmx port.
2483:                    String p = System
2484:                            .getProperty("com.sun.management.jmxremote.port");
2485:                    if (p != null && p.length() > 0) {
2486:                        ht.put(JmxUtils.JMX_PORT, p);
2487:                        name = new ObjectName(name.getDomain(), ht);
2488:                    }
2489:                }
2490:                return name;
2491:            }
2492:
2493:            protected static ObjectName addGuiPort(ObjectName name)
2494:                    throws MalformedObjectNameException, NullPointerException {
2495:                @SuppressWarnings("unchecked")
2496:                Hashtable<String, String> ht = name.getKeyPropertyList();
2497:                if (!ht.containsKey(JmxUtils.GUI_PORT)) {
2498:                    // Add gui port if this instance was started with a gui.
2499:                    if (Heritrix.gui) {
2500:                        ht.put(JmxUtils.GUI_PORT, Integer
2501:                                .toString(Heritrix.guiPort));
2502:                        name = new ObjectName(name.getDomain(), ht);
2503:                    }
2504:                }
2505:                return name;
2506:            }
2507:
2508:            public void postRegister(Boolean registrationDone) {
2509:                if (logger.isLoggable(Level.INFO)) {
2510:                    logger.info(JmxUtils.getLogRegistrationMsg(this .mbeanName
2511:                            .getCanonicalName(), this .mbeanServer,
2512:                            registrationDone.booleanValue()));
2513:                }
2514:                try {
2515:                    registerJndi(this .mbeanName);
2516:                } catch (Exception e) {
2517:                    logger.log(Level.SEVERE, "Failed jndi registration", e);
2518:                }
2519:            }
2520:
2521:            public void preDeregister() throws Exception {
2522:                deregisterJndi(this .mbeanName);
2523:            }
2524:
2525:            public void postDeregister() {
2526:                Heritrix.instances.remove(this .mbeanName
2527:                        .getCanonicalKeyPropertyListString());
2528:                if (logger.isLoggable(Level.INFO)) {
2529:                    logger.info(JmxUtils.getLogUnregistrationMsg(this .mbeanName
2530:                            .getCanonicalName(), this .mbeanServer));
2531:                }
2532:            }
2533:
2534:            protected static void registerContainerJndi()
2535:                    throws MalformedObjectNameException, NullPointerException,
2536:                    UnknownHostException, NamingException {
2537:                registerJndi(getJndiContainerName());
2538:            }
2539:
2540:            protected static void registerJndi(final ObjectName name)
2541:                    throws NullPointerException, NamingException {
2542:                Context c = getJndiContext();
2543:                if (c == null) {
2544:                    return;
2545:                }
2546:                CompoundName key = JndiUtils.bindObjectName(c, name);
2547:                if (logger.isLoggable(Level.FINE)) {
2548:                    logger.fine("Bound '"
2549:                            + key
2550:                            + "' to '"
2551:                            + JndiUtils.getCompoundName(c.getNameInNamespace())
2552:                                    .toString() + "' jndi context");
2553:                }
2554:            }
2555:
2556:            protected static void deregisterJndi(final ObjectName name)
2557:                    throws NullPointerException, NamingException {
2558:                Context c = getJndiContext();
2559:                if (c == null) {
2560:                    return;
2561:                }
2562:                CompoundName key = JndiUtils.unbindObjectName(c, name);
2563:                if (logger.isLoggable(Level.FINE)) {
2564:                    logger.fine("Unbound '"
2565:                            + key
2566:                            + "' from '"
2567:                            + JndiUtils.getCompoundName(c.getNameInNamespace())
2568:                                    .toString() + "' jndi context");
2569:                }
2570:            }
2571:
2572:            /**
2573:             * @return Jndi context for the crawler or null if none found.
2574:             * @throws NamingException 
2575:             */
2576:            protected static Context getJndiContext() throws NamingException {
2577:                Context c = null;
2578:                try {
2579:                    c = JndiUtils.getSubContext(CRAWLER_PACKAGE);
2580:                } catch (NoInitialContextException e) {
2581:                    logger.fine("No JNDI Context: " + e.toString());
2582:                }
2583:                return c;
2584:            }
2585:
2586:            /**
2587:             * @return Jndi container name -- the name to use for the 'container' that
2588:             * can host zero or more heritrix instances (Return a JMX ObjectName.  We
2589:             * use ObjectName because then we're sync'd with JMX naming and ObjectName
2590:             * has nice parsing).
2591:             * @throws NullPointerException 
2592:             * @throws MalformedObjectNameException 
2593:             * @throws UnknownHostException 
2594:             */
2595:            protected static ObjectName getJndiContainerName()
2596:                    throws MalformedObjectNameException, NullPointerException,
2597:                    UnknownHostException {
2598:                ObjectName objName = new ObjectName(CRAWLER_PACKAGE, "type",
2599:                        "container");
2600:                return addVitals(objName);
2601:            }
2602:
2603:            /**
2604:             * @return Return all registered instances of Heritrix (Rare are there 
2605:             * more than one).
2606:             */
2607:            public static Map getInstances() {
2608:                return Heritrix.instances;
2609:            }
2610:
2611:            /**
2612:             * @return True if only one instance of Heritrix.
2613:             */
2614:            public static boolean isSingleInstance() {
2615:                return Heritrix.instances != null
2616:                        && Heritrix.instances.size() == 1;
2617:            }
2618:
2619:            /**
2620:             * @return Returns single instance or null if no instance or multiple.
2621:             */
2622:            public static Heritrix getSingleInstance() {
2623:                return !isSingleInstance() ? null
2624:                        : (Heritrix) Heritrix.instances.get(Heritrix.instances
2625:                                .keySet().iterator().next());
2626:            }
2627:        }
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.