001: /*
002: * $Id: Search.java,v 1.3 2004/10/10 14:21:29 csaltos Exp $
003: *
004: * Copyright 1999 PUCE [http://www.puce.edu.ec]
005: *
006: * Licensed under the Apache License, Version 2.0 (the "License");
007: * you may not use this file except in compliance with the License.
008: * You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing, software
013: * distributed under the License is distributed on an "AS IS" BASIS,
014: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015: * See the License for the specific language governing permissions and
016: * limitations under the License.
017: */
018: package org.oxyus.search;
019:
020: import java.io.IOException;
021: import java.util.ArrayList;
022: import java.util.Iterator;
023: import java.util.StringTokenizer;
024:
025: import org.apache.log4j.Logger;
026: import org.apache.lucene.analysis.Analyzer;
027: import org.apache.lucene.analysis.StopAnalyzer;
028: import org.apache.lucene.index.IndexReader;
029: import org.apache.lucene.queryParser.ParseException;
030: import org.apache.lucene.queryParser.QueryParser;
031: import org.apache.lucene.search.Hits;
032: import org.apache.lucene.search.IndexSearcher;
033: import org.apache.lucene.search.Query;
034: import org.oxyus.admin.Configuration;
035: import org.oxyus.admin.ConfigurationException;
036:
037: /**
038: * @author Carlos Saltos (csaltos[@]users.sourceforge.net)
039: */
040: public class Search {
041:
042: private Logger log;
043:
044: protected Hits hits = null;
045:
046: protected int totalDocumentsInIndex = 0;
047:
048: protected String queryString;
049:
050: // The number of charactes to collect for
051: // dynamic summaries when a query keyword
052: // ist detected in the document content
053: // TODO: get the summaryKeywordRatio and the max summary size
054: // from config
055: protected int summaryKeywordRatio = 50;
056:
057: protected int maxSummarySize = 250;
058:
059: public Search(String queryString) throws SearchException {
060: log = Logger.getLogger(Search.class);
061: // register the queryString
062: this .queryString = queryString;
063: // perform search based on the query string
064: search();
065: }
066:
067: protected void search() throws SearchException {
068: // Open access to the Apache Lucene Index
069: IndexSearcher indexSearcher = null;
070: IndexReader indexReader = null;
071: try {
072: indexReader = IndexReader.open(Configuration
073: .getIndexDirectory());
074: indexSearcher = new IndexSearcher(indexReader);
075: // register the total number of documents in the index
076: totalDocumentsInIndex = indexReader.numDocs();
077: } catch (IOException ioe) {
078: log.error("Unable to access the index", ioe);
079: throw new SearchException("Unable to access the index", ioe);
080: } catch (ConfigurationException ce) {
081: log.error("Unable to locate index directory", ce);
082: throw new SearchException(
083: "Unable to locate index directory", ce);
084: }
085: // Create a regular stop analyzer
086: // TODO: research about analyzers
087: Analyzer analyzer = new StopAnalyzer();
088: // Create and essemble an Apache Lucene Query
089: Query query = null;
090: try {
091: query = QueryParser
092: .parse(queryString, "contents", analyzer);
093: } catch (ParseException pe) {
094: log.error("Query sintax error", pe);
095: throw new SearchException("Query sintax error", pe);
096: }
097: Hits hits = null;
098: try {
099: hits = indexSearcher.search(query);
100: } catch (IOException ioe) {
101: log.error("Error searching the index", ioe);
102: throw new SearchException("Error searching the index", ioe);
103: }
104: // register the hits found
105: this .hits = hits;
106: }
107:
108: public String getQueryString() {
109: return queryString;
110: }
111:
112: public int getTotalDocumentsInIndex() throws SearchException {
113: return totalDocumentsInIndex;
114: }
115:
116: public int getTotalResults() throws SearchException {
117: return hits.length();
118: }
119:
120: public String getTitle(int index) throws SearchException {
121: String title = get(index, "title");
122: if (title == null) {
123: title = "No title";
124: }
125: return escape(title);
126: }
127:
128: public ArrayList getQueryKeywords(boolean inLowerCase) {
129: ArrayList keywords = new ArrayList();
130: StringTokenizer queryTokens = new StringTokenizer(
131: getQueryString());
132: while (queryTokens.hasMoreTokens()) {
133: String token = queryTokens.nextToken();
134: if (inLowerCase) {
135: // TODO: set locale for lowerCase()
136: token = token.toLowerCase();
137: }
138: if (!keywords.contains(token)) {
139: keywords.add(token);
140:
141: }
142: }
143: return keywords;
144: }
145:
146: /**
147: * looks for a word in a list of query keywords. NOTE: This method is case sensity
148: * @param queryKeywords an array list with the keywords used in the query.
149: * the keywords are normally lowercase
150: * @param word the word to verify it's a query keyword, usually lowercase
151: */
152: public boolean isQueryKeyword(ArrayList queryKeywords, String word) {
153: boolean isKeyword = false;
154: Iterator iterator = queryKeywords.iterator();
155: while (iterator.hasNext()) {
156: String queryKeyword = (String) iterator.next();
157: if (word.equals(queryKeyword)) {
158: isKeyword = true;
159: }
160: }
161: return isKeyword;
162: }
163:
164: public String getSummary(int index) throws SearchException {
165: // TODO: rename contents to content
166: // TODO: escape contents to UTF-8
167: String content = get(index, "contents");
168: if (content == null || content.equals("")) {
169: return "";
170: }
171: boolean firstChunk = true;
172: boolean recentTokensChunked = false;
173: StringBuffer summary = new StringBuffer(maxSummarySize);
174: ArrayList queryKeywords = getQueryKeywords(true);
175: StringTokenizer contentTokens = new StringTokenizer(content);
176: StringBuffer recentTokens = new StringBuffer(); //TODO: optimize initial size
177: while (contentTokens.hasMoreTokens()
178: && summary.length() < maxSummarySize) {
179: String contentToken = contentTokens.nextToken();
180: // TODO: set locale for lowerCase()
181: if (isQueryKeyword(queryKeywords, contentToken
182: .toLowerCase())) {
183: if (firstChunk && recentTokensChunked) {
184: summary.append("... ");
185: }
186: summary.append(recentTokens);
187: summary.append(contentToken + " ");
188: int added = 0, initialLength = summary.length();
189: while (contentTokens.hasMoreTokens()
190: && summary.length() < maxSummarySize
191: && added < summaryKeywordRatio) {
192: contentToken = contentTokens.nextToken();
193: summary.append(contentToken + " ");
194: added = summary.length() - initialLength;
195: }
196: if (contentTokens.hasMoreTokens()) {
197: summary.append("... ");
198: }
199: firstChunk = false;
200: // resets recent tokens
201: recentTokensChunked = false;
202: recentTokens.delete(0, recentTokens.length());
203: } else {
204: recentTokens.append(contentToken + " ");
205: if (recentTokens.length() > summaryKeywordRatio) {
206: int newStart = recentTokens.length()
207: - summaryKeywordRatio;
208: recentTokens.delete(0, newStart);
209: recentTokensChunked = true;
210: }
211: }
212: }
213: StringBuffer formattedSummary = new StringBuffer();
214: StringTokenizer summaryTokens = new StringTokenizer(summary
215: .toString());
216: while (summaryTokens.hasMoreTokens()) {
217: String token = summaryTokens.nextToken();
218: if (isQueryKeyword(queryKeywords, token.toLowerCase())) {
219: // TODO: add styles to the summary query keyords
220: formattedSummary
221: .append("<b>" + escape(token) + "</b> ");
222: } else {
223: formattedSummary.append(escape(token) + " ");
224: }
225: }
226: return formattedSummary.toString();
227: }
228:
229: public String getUrl(int index) throws SearchException {
230: return escape(get(index, "url"));
231: }
232:
233: public String getContent(int index) throws SearchException {
234: return escape(get(index, "contents"));
235: }
236:
237: public String escape(String value) {
238: if (value != null) {
239: value = value.replaceAll("<", "<");
240: value = value.replaceAll(">", ">");
241: }
242: return value;
243: }
244:
245: public String get(int index, String field) throws SearchException {
246: try {
247: return hits.doc(index).get(field);
248: } catch (IOException ioe) {
249: log.error("Unable to get " + field + " for document #"
250: + index, ioe);
251: throw new SearchException("Unable to get " + field
252: + " for document #" + index, ioe);
253: }
254: }
255:
256: }
|