001: /*
002: * CVS information:
003: * $RCSfile$
004: * $Source$
005: * $Date: 2006-01-17 11:50:42 +0100 (Di, 17 Jan 2006) $
006: * $Author: til132 $
007: * $Revision: 189 $
008: */
009: package net.sf.regain.crawler.config;
010:
011: import net.sf.regain.RegainException;
012:
013: import org.apache.regexp.RE;
014:
015: /**
016: * An auxiliary field is a additional field put into the index.
017: * <p>
018: * Example: If you have a directory with a sub directory for every project,
019: * then you may create a field with the project's name.
020: * <p>
021: * The folling rule will create a field "project" with the value "otto23"
022: * from the URL "file://c:/projects/otto23/docs/Spez.doc":
023: * <code>new AuxiliaryField("project", "^file://c:/projects/([^/]*)", 1)</code>
024: * <p>
025: * URLs that doen't match will get no "project" field.
026: * <p>
027: * Having done this you may search for "Offer project:otto23" and you will get
028: * only hits from this project directory.
029: *
030: * @author Tilman Schneider, STZ-IDA an der FH Karlsruhe
031: */
032: public class AuxiliaryField {
033:
034: /** The name of the auxiliary field. */
035: private String mFieldName;
036:
037: /**
038: * The value of the auxiliary field. If null, the value will be extracted from
039: * the regex using the urlRegexGroup.
040: */
041: private String mValue;
042:
043: /** Specifies whether the (extracted) value should be converted to lower case. */
044: private boolean mToLowerCase;
045:
046: /** The regex that extracts the value of the field. */
047: private RE mUrlRegex;
048:
049: /** The group of the regex that contains the value. */
050: private int mUrlRegexGroup;
051:
052: /** Specifies whether the field value should be stored in the index. */
053: private boolean mStore;
054:
055: /** Specifies whether the field value should be indexed. */
056: private boolean mIndex;
057:
058: /** Specifies whether the field value should be tokenized. */
059: private boolean mTokenize;
060:
061: /**
062: * Creates a new instance of AuxiliaryField.
063: *
064: * @param fieldName The name of the auxiliary field.
065: * @param value The value of the auxiliary field. If null, the value will be
066: * extracted from the regex using the urlRegexGroup.
067: * @param toLowerCase Whether the (extracted) value should be converted to
068: * lower case.
069: * @param urlRegex The regex that extracts the value of the field.
070: * @param urlRegexGroup The group of the regex that contains the value.
071: * @param store Specifies whether the field value should be stored in the
072: * index.
073: * @param index Specifies whether the field value should be indexed.
074: * @param tokenize Specifies whether the field value should be tokenized.
075: *
076: * @throws RegainException If the regex has a syntax error.
077: */
078: public AuxiliaryField(String fieldName, String value,
079: boolean toLowerCase, RE urlRegex, int urlRegexGroup,
080: boolean store, boolean index, boolean tokenize)
081: throws RegainException {
082: mFieldName = fieldName;
083: mValue = value;
084: mToLowerCase = toLowerCase;
085: mUrlRegex = urlRegex;
086: mUrlRegexGroup = urlRegexGroup;
087: mStore = store;
088: mIndex = index;
089: mTokenize = tokenize;
090: }
091:
092: /**
093: * Gets the name of the auxiliary field.
094: *
095: * @return The name of the auxiliary field.
096: */
097: public String getFieldName() {
098: return mFieldName;
099: }
100:
101: /**
102: * Returns the value of the auxiliary field.
103: * <p>
104: * If null, the value will be extracted from the regex using the urlRegexGroup.
105: *
106: * @return The value of the auxiliary field.
107: */
108: public String getValue() {
109: return mValue;
110: }
111:
112: /**
113: * Returns whether the (extracted) value should be converted to lower case.
114: *
115: * @return Whether the (extracted) value should be converted to lower case.
116: */
117: public boolean getToLowerCase() {
118: return mToLowerCase;
119: }
120:
121: /**
122: * Gets the regex that extracts the value of the field.
123: *
124: * @return The regex that extracts the value of the field.
125: */
126: public RE getUrlRegex() {
127: return mUrlRegex;
128: }
129:
130: /**
131: * Gets the group of the regex that contains the value.
132: *
133: * @return The group of the regex that contains the value.
134: */
135: public int getUrlRegexGroup() {
136: return mUrlRegexGroup;
137: }
138:
139: /**
140: * Returns whether the field value should be stored in the index.
141: *
142: * @return whether the field value should be stored in the index.
143: */
144: public boolean isStored() {
145: return mStore;
146: }
147:
148: /**
149: * Returns whether the field value should be indexed.
150: *
151: * @return whether the field value should be indexed.
152: */
153: public boolean isIndexed() {
154: return mIndex;
155: }
156:
157: /**
158: * Returns whether the field value should be tokenized.
159: *
160: * @return whether the field value should be tokenized.
161: */
162: public boolean isTokenized() {
163: return mTokenize;
164: }
165:
166: }
|