Source Code Cross Referenced for XmlCrawlerConfig.java in » Search-Engine » regain » net » sf » regain » crawler » config » Java Source Code / Java DocumentationJava Source Code and Java Documentation

Java Source Code / Java Documentation
1.	6.0 JDK Core
2.	6.0 JDK Modules
3.	6.0 JDK Modules com.sun
4.	6.0 JDK Modules com.sun.java
5.	6.0 JDK Modules sun
6.	6.0 JDK Platform
7.	Ajax
8.	Apache Harmony Java SE
9.	Aspect oriented
10.	Authentication Authorization
11.	Blogger System
12.	Build
13.	Byte Code
14.	Cache
15.	Chart
16.	Chat
17.	Code Analyzer
18.	Collaboration
19.	Content Management System
20.	Database Client
21.	Database DBMS
22.	Database JDBC Connection Pool
23.	Database ORM
24.	Development
25.	EJB Server geronimo
26.	EJB Server GlassFish
27.	EJB Server JBoss 4.2.1
28.	EJB Server resin 3.1.5
29.	ERP CRM Financial
30.	ESB
31.	Forum
32.	GIS
33.	Graphic Library
34.	Groupware
35.	HTML Parser
36.	IDE
37.	IDE Eclipse
38.	IDE Netbeans
39.	Installer
40.	Internationalization Localization
41.	Inversion of Control
42.	Issue Tracking
43.	J2EE
44.	JBoss
45.	JMS
46.	JMX
47.	Library
48.	Mail Clients
49.	Net
50.	Parser
51.	PDF
52.	Portal
53.	Profiler
54.	Project Management
55.	Report
56.	RSS RDF
57.	Rule Engine
58.	Science
59.	Scripting
60.	Search Engine
61.	Security
62.	Servlet Container
63.	Source Control
64.	Swing Library
65.	Template Engine
66.	Test Coverage
67.	Testing
68.	UML
69.	Web Crawler
70.	Web Framework
71.	Web Mail
72.	Web Server
73.	Web Services
74.	Web Services apache cxf 2.0.1
75.	Web Services AXIS2
76.	Wiki Engine
77.	Workflow Engines
78.	XML
79.	XML UI
Java
Java Tutorial
Illustrator Tutorials
GIMP Tutorials
C# / C Sharp
C# / CSharp Tutorial
C# / CSharp Open Source
SQL Server / T-SQL Tutorial
Oracle PL / SQL
Oracle PL/SQL Tutorial
Flash / Flex / ActionScript
VBA / Excel / Access / Word
XML
XML Tutorial
Microsoft Office PowerPoint 2007 Tutorial
Microsoft Office Excel 2007 Tutorial
Microsoft Office Word 2007 Tutorial
Java Source Code / Java Documentation » Search Engine » regain » net.sf.regain.crawler.config
Source Cross Referenced Class Diagram Java Document (Java Doc)
001:        /*
002:         * regain - A file search engine providing plenty of formats
003:         * Copyright (C) 2004  Til Schneider
004:         *
005:         * This library is free software; you can redistribute it and/or
006:         * modify it under the terms of the GNU Lesser General Public
007:         * License as published by the Free Software Foundation; either
008:         * version 2.1 of the License, or (at your option) any later version.
009:         *
010:         * This library is distributed in the hope that it will be useful,
011:         * but WITHOUT ANY WARRANTY; without even the implied warranty of
012:         * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
013:         * Lesser General Public License for more details.
014:         *
015:         * You should have received a copy of the GNU Lesser General Public
016:         * License along with this library; if not, write to the Free Software
017:         * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
018:         *
019:         * Contact: Til Schneider, info@murfman.de
020:         *
021:         * CVS information:
022:         *  $RCSfile$
023:         *   $Source$
024:         *     $Date: 2007-11-01 13:53:31 +0100 (Do, 01 Nov 2007) $
025:         *   $Author: til132 $
026:         * $Revision: 260 $
027:         */
028:        package net.sf.regain.crawler.config;
029:
030:        import java.io.File;
031:        import java.util.HashMap;
032:        import java.util.Properties;
033:
034:        import net.sf.regain.RegainException;
035:        import net.sf.regain.XmlToolkit;
036:
037:        import org.apache.regexp.RE;
038:        import org.apache.regexp.RESyntaxException;
039:        import org.w3c.dom.Document;
040:        import org.w3c.dom.Element;
041:        import org.w3c.dom.Node;
042:
043:        /**
044:         * Reads the configuration settings from an XML file and makes them
045:         * available.
046:         *
047:         * @author Til Schneider, www.murfman.de
048:         */
049:        public class XmlCrawlerConfig implements  CrawlerConfig {
050:
051:            /** The host name of the proxy server. */
052:            private String mProxyHost;
053:            /** The port of the proxy server. */
054:            private String mProxyPort;
055:            /** The user name for the authentication at the proxy server. */
056:            private String mProxyUser;
057:            /** The password for the authentication at the proxy server. */
058:            private String mProxyPassword;
059:            /** The user agent the crawler should use to identify itself at the HTTP server(s). */
060:            private String mUserAgent;
061:            /**
062:             * Specifies whether URLs should be loaded that are neither parsed nor
063:             * indexed.
064:             */
065:            private boolean mLoadUnparsedUrls;
066:            /** Specifies whether a search index should be built. */
067:            private boolean mBuildIndex;
068:            /**
069:             * The timeout for HTTP downloads. This value defines the maximum time
070:             * in seconds that an HTTP download may take in total.
071:             */
072:            private int mHttpTimeoutSecs;
073:            /** The directory where the search index should be located. */
074:            private String mIndexDir;
075:
076:            /** The maximum number of terms per document. */
077:            private int mMaxFieldLength;
078:
079:            /** The analyzer type to use. */
080:            private String mAnalyzerType;
081:
082:            /** Contains all words that should not be indexed. */
083:            private String[] mStopWordList;
084:            /**
085:             * Contains all words that should not be changed by the analyzer during
086:             * indexing.
087:             */
088:            private String[] mExclusionList;
089:
090:            /** Specifies whether analysis files should be written. */
091:            private boolean mWriteAnalysisFiles;
092:            /** The interval between two breakpoints in minutes. */
093:            private int mBreakpointInterval;
094:            /**
095:             * The maximum share of failed documents tolerated for releasing an index.
096:             * Configured as a percentage (0..100) and stored divided by 100.
097:             */
098:            private double mMaxFailedDocuments;
099:
100:            /** The name of the control file for an index creation without fatal errors. */
101:            private String mFinishedWithoutFatalsFileName;
102:            /** The name of the control file for an index creation with fatal errors. */
103:            private String mFinishedWithFatalsFileName;
104:
105:            /** The start URLs. */
106:            private StartUrl[] mStartUrls;
107:
108:            /** The URL patterns the HTML parser should use to identify URLs. */
109:            private UrlPattern[] mHtmlParserUrlPatterns;
110:
111:            /** The black list. */
112:            private UrlMatcher[] mBlackList;
113:            /** The white list. */
114:            private WhiteListEntry[] mWhiteListEntryArr;
115:
116:            /** The names of the fields to prefetch the distinct values for. */
117:            private String[] mValuePrefetchFields;
118:
119:            /**
120:             * The regular expressions a document's URL must match so that the text of
121:             * the link that pointed to the document is used as the document title
122:             * instead of the real document title.
123:             */
124:            private String[] mUseLinkTextAsTitleRegexList;
125:
126:            /** The list with the preparator settings. */
127:            private PreparatorSettings[] mPreparatorSettingsArr;
128:
129:            /** The list of the auxiliary fields. May be null. */
130:            private AuxiliaryField[] mAuxiliaryFieldArr;
131:
132:            /** The class name of the CrawlerAccessController to use. */
133:            private String mCrawlerAccessControllerClass;
134:            /** The name of jar file to load the CrawlerAccessController from. */
135:            private String mCrawlerAccessControllerJar;
136:            /** The configuration of the CrawlerAccessController. */
137:            private Properties mCrawlerAccessControllerConfig;
138:
139:            /**
140:             * Creates a new XmlCrawlerConfig instance by loading the given XML file and
141:             * reading all settings from its document element.
142:             *
143:             * @param xmlFile The XML file to read the configuration from.
144:             *
145:             * @throws RegainException If the configuration could not be read correctly.
146:             */
147:            public XmlCrawlerConfig(File xmlFile) throws RegainException {
148:                Element rootElem = XmlToolkit.loadXmlDocument(xmlFile)
149:                        .getDocumentElement();
150:
151:                readProxyConfig(rootElem);
152:                readLoadUnparsedUrls(rootElem);
153:                readHttpTimeoutSecs(rootElem);
154:                readUserAgent(rootElem);
155:                readIndexConfig(rootElem);
156:                readControlFileConfig(rootElem);
157:                readStartUrls(rootElem);
158:                readHtmlParserUrlPatterns(rootElem);
159:                readBlackList(rootElem);
160:                readWhiteList(rootElem);
161:                readUseLinkTextAsTitleRegexList(rootElem);
162:                readPreparatorSettingsList(rootElem, xmlFile);
163:                readAuxiliaryFieldList(rootElem);
164:                readCrawlerAccessController(rootElem);
165:            }
166:
167:            /**
168:             * Reads from the configuration whether documents should be loaded that are
169:             * neither indexed nor searched for URLs.
170:             *
171:             * @param config The configuration to read from.
172:             * @throws RegainException If the configuration is invalid.
173:             */
174:            private void readLoadUnparsedUrls(Element config)
175:                    throws RegainException {
176:                Node loadUnparsedNode = XmlToolkit.getChild(config, "loadUnparsedUrls");
177:                mLoadUnparsedUrls = (loadUnparsedNode != null)
178:                        && XmlToolkit.getTextAsBoolean(loadUnparsedNode);
179:            }
180:
181:            /**
182:             * Reads the timeout for HTTP downloads from the configuration.
183:             *
184:             * @param config The configuration to read from.
185:             * @throws RegainException If the configuration is invalid.
186:             */
187:            private void readHttpTimeoutSecs(Element config) throws RegainException {
188:                Node timeoutNode = XmlToolkit.getChild(config, "httpTimeout");
189:                // Without a 'httpTimeout' node the timeout is set to 180
190:                boolean configured = (timeoutNode != null);
191:                mHttpTimeoutSecs = configured ? XmlToolkit.getTextAsInt(timeoutNode) : 180;
192:            }
193:
194:            /**
195:             * Reads the user agent from the 'userAgent' node, if there is one.
196:             * @param config The configuration to read from.
197:             * @throws RegainException If the configuration has an error.
198:             */
199:            private void readUserAgent(Element config) throws RegainException {
200:                Node userAgentNode = XmlToolkit.getChild(config, "userAgent", false);
201:                if (userAgentNode == null) {
202:                    return; // nothing configured: mUserAgent is left untouched
203:                }
204:                mUserAgent = XmlToolkit.getText(userAgentNode);
205:            }
206:
207:            /**
208:             * Reads the proxy settings (host, port, user, password) from the configuration.
209:             *
210:             * @param config The configuration to read from.
211:             * @throws RegainException If the configuration is invalid.
212:             */
213:            private void readProxyConfig(Node config) throws RegainException {
214:                Node proxyNode = XmlToolkit.getChild(config, "proxy");
215:                if (proxyNode == null) {
216:                    // No proxy section -> all proxy fields are left untouched
217:                    return;
218:                }
219:                Node hostNode = XmlToolkit.getChild(proxyNode, "host");
220:                if (hostNode != null) {
221:                    mProxyHost = XmlToolkit.getText(hostNode, true);
222:                }
223:                Node portNode = XmlToolkit.getChild(proxyNode, "port");
224:                if (portNode != null) {
225:                    mProxyPort = XmlToolkit.getText(portNode, true);
226:                }
227:                Node userNode = XmlToolkit.getChild(proxyNode, "user");
228:                if (userNode != null) {
229:                    mProxyUser = XmlToolkit.getText(userNode, true);
230:                }
231:                Node passwordNode = XmlToolkit.getChild(proxyNode, "password");
232:                if (passwordNode != null) {
233:                    mProxyPassword = XmlToolkit.getText(passwordNode, true);
234:                }
235:            }
236:
237:            /**
238:             * Reads the settings concerning the search index from the configuration.
239:             *
240:             * @param config The configuration to read from.
241:             * @throws RegainException If the configuration is invalid.
242:             */
243:            private void readIndexConfig(Node config) throws RegainException {
244:                Node indexNode = XmlToolkit.getChild(config, "searchIndex", true);
245:
246:                Node dirNode = XmlToolkit.getChild(indexNode, "dir", true);
247:                mIndexDir = XmlToolkit.getText(dirNode, true);
248:
249:                Node buildNode = XmlToolkit.getChild(indexNode, "buildIndex");
250:                mBuildIndex = (buildNode == null) || XmlToolkit.getTextAsBoolean(buildNode);
251:
252:                Node analyzerNode = XmlToolkit.getChild(indexNode, "analyzerType", true);
253:                mAnalyzerType = XmlToolkit.getText(analyzerNode, true);
254:
255:                Node maxLenNode = XmlToolkit.getChild(indexNode, "maxFieldLength", false);
256:                mMaxFieldLength = (maxLenNode != null) ? XmlToolkit.getTextAsInt(maxLenNode) : -1;
257:
258:                Node stopNode = XmlToolkit.getChild(indexNode, "stopwordList", false);
259:                mStopWordList = (stopNode != null)
260:                        ? XmlToolkit.getTextAsWordList(stopNode, true) : null;
261:
262:                Node exclusionNode = XmlToolkit.getChild(indexNode, "exclusionList", true);
263:                mExclusionList = XmlToolkit.getTextAsWordList(exclusionNode, false);
264:
265:                Node analysisNode = XmlToolkit.getChild(indexNode, "writeAnalysisFiles");
266:                mWriteAnalysisFiles = (analysisNode != null)
267:                        && XmlToolkit.getTextAsBoolean(analysisNode);
268:
269:                Node intervalNode = XmlToolkit.getChild(indexNode, "breakpointInterval");
270:                mBreakpointInterval = (intervalNode != null) ? XmlToolkit.getTextAsInt(intervalNode) : 10;
271:
272:                // The configured value is divided by 100 before it is stored (default: 1.0)
273:                Node failedNode = XmlToolkit.getChild(indexNode, "maxFailedDocuments");
274:                mMaxFailedDocuments = (failedNode != null)
275:                        ? (XmlToolkit.getTextAsDouble(failedNode) / 100.0) : 1.0;
276:
277:                Node prefetchNode = XmlToolkit.getChild(indexNode, "valuePrefetchFields", false);
278:                mValuePrefetchFields = (prefetchNode != null)
279:                        ? XmlToolkit.getTextAsWordList(prefetchNode, false) : null;
280:            }
281:
282:            /**
283:             * Reads the names of the control files from the configuration.
284:             *
285:             * @param config The configuration to read from.
286:             * @throws RegainException If the configuration is invalid.
287:             */
288:            private void readControlFileConfig(Node config) throws RegainException {
289:                Node ctrlFilesNode = XmlToolkit.getChild(config, "controlFiles");
290:                if (ctrlFilesNode == null) {
291:                    // No control files section -> both file names are left untouched
292:                    return;
293:                }
294:
295:                Node okFileNode = XmlToolkit.getChild(ctrlFilesNode,
296:                        "finishedWithoutFatalsFile");
297:                if (okFileNode != null) {
298:                    String okFileName = XmlToolkit.getText(okFileNode, true);
299:                    mFinishedWithoutFatalsFileName = okFileName.trim();
300:                }
301:
302:                Node fatalFileNode = XmlToolkit.getChild(ctrlFilesNode,
303:                        "finishedWithFatalsFile");
304:                if (fatalFileNode != null) {
305:                    String fatalFileName = XmlToolkit.getText(fatalFileNode, true);
306:                    mFinishedWithFatalsFileName = fatalFileName.trim();
307:                }
308:            }
309:
310:            /**
311:             * Reads the start URLs from the 'startlist' node of the configuration.
312:             *
313:             * @param config The configuration to read from.
314:             * @throws RegainException If the configuration is invalid.
315:             */
316:            private void readStartUrls(Node config) throws RegainException {
317:                Node startListNode = XmlToolkit.getChild(config, "startlist", true);
318:                Node[] startNodeArr = XmlToolkit.getChildArr(startListNode, "start");
319:
320:                int startCount = startNodeArr.length;
321:                mStartUrls = new StartUrl[startCount];
322:                for (int idx = 0; idx < startCount; idx++) {
323:                    Node startNode = startNodeArr[idx];
324:                    String url = XmlToolkit.getTextAsUrl(startNode);
325:                    boolean parse = XmlToolkit.getAttributeAsBoolean(startNode, "parse");
326:                    boolean index = XmlToolkit.getAttributeAsBoolean(startNode, "index");
327:                    mStartUrls[idx] = new StartUrl(url, parse, index);
328:                }
329:            }
330:
331:            /**
332:             * Reads the URL patterns for the HTML parser from the configuration.
333:             * <p>
334:             * These patterns are used to identify URLs when an HTML document is
335:             * searched.
336:             *
337:             * @param config The configuration to read from.
338:             * @throws RegainException If the configuration is invalid.
339:             */
340:            private void readHtmlParserUrlPatterns(Node config)
341:                    throws RegainException {
342:                Node listNode = XmlToolkit.getChild(config, "htmlParserPatternList", true);
343:                Node[] patternNodeArr = XmlToolkit.getChildArr(listNode, "pattern");
344:
345:                int patternCount = patternNodeArr.length;
346:                mHtmlParserUrlPatterns = new UrlPattern[patternCount];
347:                for (int idx = 0; idx < patternCount; idx++) {
348:                    Node patternNode = patternNodeArr[idx];
349:
350:                    String regex = XmlToolkit.getText(patternNode, true);
351:                    int group = XmlToolkit.getAttributeAsInt(patternNode, "regexGroup");
352:                    boolean parse = XmlToolkit.getAttributeAsBoolean(patternNode, "parse");
353:                    boolean index = XmlToolkit.getAttributeAsBoolean(patternNode, "index");
354:
355:                    UrlPattern pattern = new UrlPattern(regex, group, parse, index);
356:                    mHtmlParserUrlPatterns[idx] = pattern;
357:                }
358:            }
359:
360:            /**
361:             * Reads the black list from the configuration.
362:             * <p>
363:             * Documents whose URL matches an entry of the black list won't be
364:             * processed.
365:             *
366:             * @param config The configuration to read from.
367:             * @throws RegainException If the configuration has an error.
368:             */
369:            private void readBlackList(Node config) throws RegainException {
370:                Node blackListNode = XmlToolkit.getChild(config, "blacklist", true);
371:                Node[] prefixNodes = XmlToolkit.getChildArr(blackListNode, "prefix");
372:                Node[] regexNodes = XmlToolkit.getChildArr(blackListNode, "regex");
373:
374:                // Prefix matchers come first, the regex matchers are placed behind them
375:                mBlackList = new UrlMatcher[prefixNodes.length + regexNodes.length];
376:                int pos = 0;
377:                for (int p = 0; p < prefixNodes.length; p++) {
378:                    String prefix = XmlToolkit.getText(prefixNodes[p], true);
379:                    mBlackList[pos++] = new PrefixUrlMatcher(prefix);
380:                }
381:                for (int r = 0; r < regexNodes.length; r++) {
382:                    String regex = XmlToolkit.getText(regexNodes[r], true);
383:                    mBlackList[pos++] = new RegexUrlMatcher(regex);
384:                }
385:            }
386:
387:            /**
388:             * Reads the white list from the configuration.
389:             * <p>
390:             * Documents are only processed if their URL matches an entry of the
391:             * white list.
392:             *
393:             * @param config The configuration to read from.
394:             * @throws RegainException If the configuration has an error.
395:             */
396:            private void readWhiteList(Node config) throws RegainException {
397:                Node whiteListNode = XmlToolkit.getChild(config, "whitelist", true);
398:                Node[] prefixNodes = XmlToolkit.getChildArr(whiteListNode, "prefix");
399:                Node[] regexNodes = XmlToolkit.getChildArr(whiteListNode, "regex");
400:
401:                // Prefix entries come first, the regex entries are placed behind them
402:                int entryCount = prefixNodes.length + regexNodes.length;
403:                mWhiteListEntryArr = new WhiteListEntry[entryCount];
404:                int pos = 0;
405:
406:                for (int p = 0; p < prefixNodes.length; p++) {
407:                    String prefix = XmlToolkit.getText(prefixNodes[p], true);
408:                    UrlMatcher prefixMatcher = new PrefixUrlMatcher(prefix);
409:                    String name = XmlToolkit.getAttribute(prefixNodes[p], "name");
410:                    mWhiteListEntryArr[pos++] = new WhiteListEntry(prefixMatcher, name);
411:                }
412:                for (int r = 0; r < regexNodes.length; r++) {
413:                    String regex = XmlToolkit.getText(regexNodes[r], true);
414:                    UrlMatcher regexMatcher = new RegexUrlMatcher(regex);
415:                    String name = XmlToolkit.getAttribute(regexNodes[r], "name");
416:                    mWhiteListEntryArr[pos++] = new WhiteListEntry(regexMatcher, name);
417:                }
418:            }
419:
420:            /**
421:             * Reads the list of regular expressions from the configuration that a
422:             * document's URL must match so that the text of the link that pointed to the
423:             * document is used as the document title instead of the real document
424:             * title.
425:             *
426:             * @param config The configuration to read from.
427:             * @throws RegainException If the configuration is invalid.
428:             */
429:            private void readUseLinkTextAsTitleRegexList(Node config)
430:                    throws RegainException {
431:                Node listNode = XmlToolkit.getChild(config, "useLinkTextAsTitleList");
432:                if (listNode == null) {
433:                    // Nothing configured -> use an empty list
434:                    mUseLinkTextAsTitleRegexList = new String[0];
435:                    return;
436:                }
437:                Node[] patternNodeArr = XmlToolkit.getChildArr(listNode, "urlPattern");
438:                mUseLinkTextAsTitleRegexList = new String[patternNodeArr.length];
439:                for (int idx = 0; idx < patternNodeArr.length; idx++) {
440:                    String regex = XmlToolkit.getText(patternNodeArr[idx], true);
441:                    mUseLinkTextAsTitleRegexList[idx] = regex;
442:                }
443:            }
444:
445:            /**
446:             * Reads the settings of all preparators from the 'preparatorList' node.
447:             *
448:             * @param config The configuration to read from.
449:             * @param xmlFile The file the configuration was read from.
450:             * @throws RegainException If the configuration has errors.
451:             */
452:            private void readPreparatorSettingsList(Node config, File xmlFile)
453:                    throws RegainException {
454:                Node listNode = XmlToolkit.getChild(config, "preparatorList", true);
455:                Node[] prepNodeArr = XmlToolkit.getChildArr(listNode, "preparator");
456:
457:                mPreparatorSettingsArr = new PreparatorSettings[prepNodeArr.length];
458:                for (int idx = 0; idx < prepNodeArr.length; idx++) {
459:                    Node prepNode = prepNodeArr[idx];
460:
461:                    boolean enabled = XmlToolkit.getAttributeAsBoolean(prepNode,
462:                            "enabled", true);
463:
464:                    Node classNode = XmlToolkit.getChild(prepNode, "class", true);
465:                    String className = XmlToolkit.getText(classNode, true);
466:
467:                    // The URL regex stays null when there is no 'urlPattern' node
468:                    Node patternNode = XmlToolkit.getChild(prepNode, "urlPattern", false);
469:                    String urlRegex = (patternNode == null) ? null
470:                            : XmlToolkit.getText(patternNode, true);
471:
472:                    // Without a 'config' node a freshly created PreparatorConfig is used
473:                    Node configNode = XmlToolkit.getChild(prepNode, "config");
474:                    PreparatorConfig prepConfig = (configNode == null)
475:                            ? new PreparatorConfig()
476:                            : readPreparatorConfig(configNode, xmlFile, className);
477:
478:                    PreparatorSettings settings = new PreparatorSettings(enabled,
479:                            className, urlRegex, prepConfig);
480:                    mPreparatorSettingsArr[idx] = settings;
481:                }
482:            }
483:
484:            /**
485:             * Reads the auxiliary fields from the 'auxiliaryFieldList' node, if present.
486:             *
487:             * @param config The configuration to read from
488:             * @throws RegainException If the configuration has errors.
489:             */
490:            private void readAuxiliaryFieldList(Node config)
491:                    throws RegainException {
492:                Node listNode = XmlToolkit.getChild(config, "auxiliaryFieldList");
493:                if (listNode == null) {
494:                    // No auxiliary field list -> the field array is left untouched
495:                    return;
496:                }
497:
498:                Node[] fieldNodeArr = XmlToolkit.getChildArr(listNode, "auxiliaryField");
499:                mAuxiliaryFieldArr = new AuxiliaryField[fieldNodeArr.length];
500:                for (int idx = 0; idx < fieldNodeArr.length; idx++) {
501:                    Node fieldNode = fieldNodeArr[idx];
502:                    String fieldName = XmlToolkit.getAttribute(fieldNode, "name", true);
503:                    RE urlRegex = readRegexChild(fieldNode);
504:                    String value = XmlToolkit.getAttribute(fieldNode, "value");
505:                    boolean toLowerCase = XmlToolkit.getAttributeAsBoolean(fieldNode,
506:                            "toLowerCase", true);
507:                    int urlRegexGroup = XmlToolkit.getAttributeAsInt(fieldNode,
508:                            "regexGroup", -1);
509:
510:                    // At least one of the attributes 'value' and 'regexGroup' is required
511:                    boolean hasSource = (value != null) || (urlRegexGroup != -1);
512:                    if (!hasSource) {
513:                        throw new RegainException("The node 'auxiliaryField' must have "
514:                                + "either the attribute 'value' or the attribute 'regexGroup'");
515:                    }
516:
517:                    boolean store = XmlToolkit.getAttributeAsBoolean(fieldNode, "store", true);
518:                    boolean index = XmlToolkit.getAttributeAsBoolean(fieldNode, "index", true);
519:                    boolean tokenize = XmlToolkit.getAttributeAsBoolean(fieldNode,
520:                            "tokenize", false);
521:
522:                    mAuxiliaryFieldArr[idx] = new AuxiliaryField(fieldName, value,
523:                            toLowerCase, urlRegex, urlRegexGroup, store, index, tokenize);
524:                }
525:            }
526:
527:            /**
528:             * Compiles the regular expression of a node. The regex is taken from the
529:             * 'regex' child node or, for old style configs, from the text of the node.
530:             *
531:             * @param node The node to read the regex from.
532:             * @return The compiled regular expression.
533:             * @throws RegainException If there is no regular expression or if the regex
534:             *         could not be compiled.
535:             */
536:            private RE readRegexChild(Node node) throws RegainException {
537:                String regex;
538:                int flags;
539:
540:                Node regexNode = XmlToolkit.getChild(node, "regex");
541:                if (regexNode == null) {
542:                    // Old style: the text of the node itself is the regex, which is
543:                    // always matched case-independently
544:                    regex = XmlToolkit.getText(node, true);
545:                    flags = RE.MATCH_CASEINDEPENDENT;
546:                } else {
547:                    // New style: a 'regex' child node carrying a 'caseSensitive' attribute
548:                    boolean caseSensitive = XmlToolkit.getAttributeAsBoolean(regexNode,
549:                            "caseSensitive", false);
550:                    regex = XmlToolkit.getText(regexNode, true);
551:                    if (caseSensitive) {
552:                        flags = RE.MATCH_NORMAL;
553:                    } else {
554:                        flags = RE.MATCH_CASEINDEPENDENT;
555:                    }
556:                }
557:
558:                try {
559:                    return new RE(regex, flags);
560:                } catch (RESyntaxException exc) {
561:                    throw new RegainException("Regex of node '" + node.getNodeName()
562:                            + "' has a wrong syntax: '" + regex + "'", exc);
563:                }
564:            }
565:
566:            /**
567:             * Builds the configuration of one preparator from its config node.
568:             *
569:             * @param prepConfig The node holding the preparator config.
570:             * @param xmlFile The file the configuration was read from.
571:             * @param className The class name of the preparator (used in error messages).
572:             * @return The configuration of the preparator.
573:             * @throws RegainException If the configuration has errors.
574:             */
575:            private PreparatorConfig readPreparatorConfig(Node prepConfig,
576:                    File xmlFile, String className) throws RegainException {
577:                // The sections are either children of the given node or, when the 'file'
578:                // attribute is set, children of the document element of that extra file,
579:                // which is looked up in the directory of the given config file
580:                Node sectionParent = prepConfig;
581:                String extraFileName = XmlToolkit.getAttribute(prepConfig, "file");
582:                if (extraFileName != null) {
583:                    File configDir = xmlFile.getParentFile();
584:                    File extraFile = new File(configDir, extraFileName);
585:                    sectionParent = XmlToolkit.loadXmlDocument(extraFile)
586:                            .getDocumentElement();
587:                }
588:
589:                PreparatorConfig config = new PreparatorConfig();
590:                Node[] sectionArr = XmlToolkit.getChildArr(sectionParent, "section");
591:                for (int secIdx = 0; secIdx < sectionArr.length; secIdx++) {
592:                    Node sectionNode = sectionArr[secIdx];
593:                    String sectionName = XmlToolkit.getAttribute(sectionNode, "name",
594:                            true);
595:
596:                    // Collect the params of this section, rejecting duplicate names
597:                    HashMap paramMap = new HashMap();
598:                    Node[] paramArr = XmlToolkit.getChildArr(sectionNode, "param");
599:                    for (int paramIdx = 0; paramIdx < paramArr.length; paramIdx++) {
600:                        Node paramNode = paramArr[paramIdx];
601:                        String paramName = XmlToolkit.getAttribute(paramNode, "name",
602:                                true);
603:                        String paramValue = XmlToolkit.getText(paramNode, true);
604:
605:                        boolean duplicate = paramMap.containsKey(paramName);
606:                        if (duplicate) {
607:                            throw new RegainException("Preparator configuration of '"
608:                                    + className + "' has multiple '" + paramName
609:                                    + "' parameters in section '" + sectionName
610:                                    + "'");
611:                        }
612:
613:                        paramMap.put(paramName, paramValue);
614:                    }
615:
616:                    config.addSection(sectionName, paramMap);
617:                }
618:
619:                return config;
620:            }
621:
622:            /**
623:             * Reads the settings of the CrawlerAccessController that should be used.
624:             *
625:             * @param config The configuration node to read from.
626:             * @throws RegainException If the configuration has errors.
627:             */
628:            private void readCrawlerAccessController(Node config)
629:                    throws RegainException {
630:                Node controllerNode = XmlToolkit.getChild(config,
631:                        "crawlerAccessController");
632:                if (controllerNode == null) {
633:                    return; // nothing configured -> fields stay untouched
634:                }
635:
636:                // Class name is the text of the "class" child, jar its "jar" attribute
637:                Node classNode = XmlToolkit.getChild(controllerNode, "class", true);
638:                mCrawlerAccessControllerClass = XmlToolkit.getText(classNode, true);
639:                mCrawlerAccessControllerJar = XmlToolkit.getAttribute(classNode, "jar");
640:
641:                Node paramParent = XmlToolkit.getChild(controllerNode, "config");
642:                if (paramParent == null) {
643:                    return;
644:                }
645:
646:                // Copy every "param" child into a fresh Properties object
647:                Properties props = new Properties();
648:                mCrawlerAccessControllerConfig = props;
649:                Node[] params = XmlToolkit.getChildArr(paramParent, "param");
650:                for (int idx = 0; idx < params.length; idx++) {
651:                    String key = XmlToolkit.getAttribute(params[idx], "name", true);
652:                    props.setProperty(key, XmlToolkit.getText(params[idx], true));
653:                }
654:            }
655:
656:            /**
657:             * Returns the host name of the proxy server. If no host was configured,
658:             * <CODE>null</CODE> is returned.
659:             *
660:             * @return The host name of the proxy server.
661:             */
662:            public String getProxyHost() {
663:                return this.mProxyHost;
664:            }
665:
666:            /**
667:             * Returns the port of the proxy server. If no port was configured,
668:             * <CODE>null</CODE> is returned.
669:             *
670:             * @return The port of the proxy server.
671:             */
672:            public String getProxyPort() {
673:                return this.mProxyPort;
674:            }
675:
676:            /**
677:             * Returns the user name for logging in at the proxy server. If no user
678:             * name was configured, <CODE>null</CODE> is returned.
679:             *
680:             * @return The user name for logging in at the proxy server.
681:             */
682:            public String getProxyUser() {
683:                return this.mProxyUser;
684:            }
685:
686:            /**
687:             * Returns the password for logging in at the proxy server. If no
688:             * password was configured, <CODE>null</CODE> is returned.
689:             *
690:             * @return The password for logging in at the proxy server.
691:             */
692:            public String getProxyPassword() {
693:                return this.mProxyPassword;
694:            }
695:
696:            // overridden: hands out the value stored in mUserAgent
697:            public String getUserAgent() {
698:                return this.mUserAgent;
699:            }
700:
701:            /**
702:             * Returns the timeout for HTTP downloads. This value defines the maximum
703:             * total time in seconds that a single HTTP download may take.
704:             *
705:             * @return The timeout for HTTP downloads in seconds.
706:             */
707:            public int getHttpTimeoutSecs() {
708:                return this.mHttpTimeoutSecs;
709:            }
710:
711:            /**
712:             * Returns whether URLs should be loaded that are neither parsed nor
713:             * indexed.
714:             *
715:             * @return Whether URLs should be loaded that are neither parsed nor
716:             *         indexed.
717:             */
718:            public boolean getLoadUnparsedUrls() {
719:                return this.mLoadUnparsedUrls;
720:            }
721:
722:            /**
723:             * Returns whether a search index should be built.
724:             *
725:             * @return Whether a search index should be built.
726:             */
727:            public boolean getBuildIndex() {
728:                return this.mBuildIndex;
729:            }
730:
731:            /**
732:             * Returns the directory where the finished search index should be located.
733:             *
734:             * @return The directory where the finished search index should be located.
735:             */
736:            public String getIndexDir() {
737:                return this.mIndexDir;
738:            }
739:
740:            /**
741:             * Returns the analyzer type to use.
742:             *
743:             * @return The analyzer type to use.
744:             */
745:            public String getAnalyzerType() {
746:                return this.mAnalyzerType;
747:            }
748:
749:            // overridden: hands out the value stored in mMaxFieldLength
750:            public int getMaxFieldLength() {
751:                return this.mMaxFieldLength;
752:            }
753:
754:            /**
755:             * Returns all words that should not be indexed.
756:             *
757:             * @return All words that should not be indexed.
758:             */
759:            public String[] getStopWordList() {
760:                return this.mStopWordList;
761:            }
762:
763:            /**
764:             * Returns all words that should not be changed by the analyzer during
765:             * indexing.
766:             *
767:             * @return All words that should not be changed by the analyzer during
768:             *         indexing.
769:             */
770:            public String[] getExclusionList() {
771:                return this.mExclusionList;
772:            }
773:
774:            /**
775:             * Returns whether analysis files should be written.
776:             * <p>
777:             * These files help to check the quality of the index creation and are
778:             * created in a sub directory of the index directory.
779:             *
780:             * @return Whether analysis files should be written.
781:             */
782:            public boolean getWriteAnalysisFiles() {
783:                return this.mWriteAnalysisFiles;
784:            }
785:
786:            /**
787:             * Gets the time span, in minutes, between two consecutive breakpoints.
788:             * A value of 0 means that no breakpoints are created.
789:             *
790:             * @return The breakpoint interval in minutes.
791:             */
792:            public int getBreakpointInterval() {
793:                return this.mBreakpointInterval;
794:            }
795:
796:            /**
797:             * Returns the maximum percentage of failed documents. (0..1)
798:             * <p>
799:             * If the ratio of failed documents to the total number of documents is
800:             * greater than this percentage, the index is discarded.
801:             * <p>
802:             * Failed documents are documents that either do not exist (dead link) or
803:             * that could not be read.
804:             *
805:             * @return The maximum percentage of failed documents.
806:             */
807:            public double getMaxFailedDocuments() {
808:                return this.mMaxFailedDocuments;
809:            }
810:
811:            /**
812:             * Returns the name of the control file for successful index creation.
813:             * <p>
814:             * This file is created if the index was built without any fatal errors
815:             * having occurred.
816:             * <p>
817:             * If no control file should be created, <code>null</code> is
818:             * returned.
819:             *
820:             * @return The name of the control file for successful index creation.
821:             */
822:            public String getFinishedWithoutFatalsFileName() {
823:                return this.mFinishedWithoutFatalsFileName;
824:            }
825:
826:            /**
827:             * Returns the name of the control file for faulty index creation.
828:             * <p>
829:             * This file is created if the index was built, but fatal errors
830:             * occurred while doing so.
831:             * <p>
832:             * If no control file should be created, <code>null</code> is
833:             * returned.
834:             *
835:             * @return The name of the control file for faulty index creation.
836:             */
837:            public String getFinishedWithFatalsFileName() {
838:                return this.mFinishedWithFatalsFileName;
839:            }
840:
841:            /**
842:             * Returns the start URLs at which the crawler process should begin.
843:             *
844:             * @return The start URLs.
845:             */
846:            public StartUrl[] getStartUrls() {
847:                return this.mStartUrls;
848:            }
849:
850:            /**
851:             * Returns the UrlPatterns the HTML parser should use to identify
852:             * URLs.
853:             *
854:             * @return The UrlPatterns for the HTML parser.
855:             */
856:            public UrlPattern[] getHtmlParserUrlPatterns() {
857:                return this.mHtmlParserUrlPatterns;
858:            }
859:
860:            /**
861:             * Returns the black list.
862:             * <p>
863:             * A URL is only processed if it matches <i>none</i> of the UrlMatchers
864:             * in this array.
865:             *
866:             * @return The black list.
867:             */
868:            public UrlMatcher[] getBlackList() {
869:                return this.mBlackList;
870:            }
871:
872:            /**
873:             * Returns the white list.
874:             * <p>
875:             * The white list is an array of WhiteListEntry that a URL <i>must</i>
876:             * match in order to be processed.
877:             *
878:             * @return The white list.
879:             */
880:            public WhiteListEntry[] getWhiteList() {
881:                return this.mWhiteListEntryArr;
882:            }
883:
884:            // overridden: hands out the array stored in mValuePrefetchFields
885:            public String[] getValuePrefetchFields() {
886:                return this.mValuePrefetchFields;
887:            }
888:
889:            /**
890:             * Returns the regular expressions a document's URL must match so that
891:             * the text of the link that pointed to the document is used as the
892:             * document title instead of the document's real title.
893:             *
894:             * @return The regular expressions identifying the documents for which the
895:             *         link text should be used as title.
896:             */
897:            public String[] getUseLinkTextAsTitleRegexList() {
898:                return this.mUseLinkTextAsTitleRegexList;
899:            }
900:
901:            /**
902:             * Returns the array holding the preparator settings.
903:             *
904:             * @return The array holding the preparator settings.
905:             */
906:            public PreparatorSettings[] getPreparatorSettingsList() {
907:                return this.mPreparatorSettingsArr;
908:            }
909:
910:            /**
911:             * Returns the array holding the auxiliary fields.
912:             *
913:             * @return The auxiliary fields. May be null.
914:             */
915:            public AuxiliaryField[] getAuxiliaryFieldList() {
916:                return this.mAuxiliaryFieldArr;
917:            }
918:
919:            /**
920:             * Returns the class name of the
921:             * {@link net.sf.regain.crawler.access.CrawlerAccessController} to use,
922:             * or <code>null</code> if no CrawlerAccessController should be used.
923:             *
924:             * @return The class name of the CrawlerAccessController.
925:             */
926:            public String getCrawlerAccessControllerClass() {
927:                return this.mCrawlerAccessControllerClass;
928:            }
929:
930:            /**
931:             * Returns the name of the jar file to load the
932:             * {@link net.sf.regain.crawler.access.CrawlerAccessController} from.
933:             * If the CrawlerAccessController is already in the classpath,
934:             * <code>null</code> is returned.
935:             *
936:             * @return The name of the jar file of the CrawlerAccessController.
937:             */
938:            public String getCrawlerAccessControllerJar() {
939:                return this.mCrawlerAccessControllerJar;
940:            }
941:
942:            /**
943:             * Returns the configuration of the
944:             * {@link net.sf.regain.crawler.access.CrawlerAccessController}, which
945:             * may be <code>null</code>.
946:             *
947:             * @return The configuration of the CrawlerAccessController.
948:             */
949:            public Properties getCrawlerAccessControllerConfig() {
950:                return this.mCrawlerAccessControllerConfig;
951:            }
952:
953:        }
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.