001: /*******************************************************************************
002: * Licensed to the Apache Software Foundation (ASF) under one
003: * or more contributor license agreements. See the NOTICE file
004: * distributed with this work for additional information
005: * regarding copyright ownership. The ASF licenses this file
006: * to you under the Apache License, Version 2.0 (the
007: * "License"); you may not use this file except in compliance
008: * with the License. You may obtain a copy of the License at
009: *
010: * http://www.apache.org/licenses/LICENSE-2.0
011: *
012: * Unless required by applicable law or agreed to in writing,
013: * software distributed under the License is distributed on an
014: * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
015: * KIND, either express or implied. See the License for the
016: * specific language governing permissions and limitations
017: * under the License.
018: *******************************************************************************/package org.ofbiz.common;
019:
020: import java.util.HashMap;
021: import java.util.HashSet;
022: import java.util.Iterator;
023: import java.util.List;
024: import java.util.Map;
025: import java.util.Set;
026: import java.util.StringTokenizer;
027: import java.util.TreeSet;
028:
029: import org.ofbiz.base.util.Debug;
030: import org.ofbiz.base.util.UtilMisc;
031: import org.ofbiz.base.util.UtilProperties;
032: import org.ofbiz.base.util.UtilValidate;
033: import org.ofbiz.entity.GenericDelegator;
034: import org.ofbiz.entity.GenericEntityException;
035: import org.ofbiz.entity.GenericValue;
036:
037: /**
038: * A few utility methods related to Keyword Search.
039: */
040: public class KeywordSearchUtil {
041:
/** Fully qualified name of this class, handed to the Debug logging calls. */
public static final String module = KeywordSearchUtil.class.getName();

/** relationshipEnumId values whose alternate keywords are added when a search keyword is expanded. */
public static Set thesaurusRelsToInclude = new HashSet();

/** relationshipEnumId values that additionally mark the entered keyword as one to be replaced. */
public static Set thesaurusRelsForReplace = new HashSet();

static {
    String[] includeRels = { "KWTR_UF", "KWTR_USE", "KWTR_CS",
            "KWTR_NT", "KWTR_BT", "KWTR_RT" };
    for (int i = 0; i < includeRels.length; i++) {
        thesaurusRelsToInclude.add(includeRels[i]);
    }

    String[] replaceRels = { "KWTR_USE", "KWTR_CS" };
    for (int i = 0; i < replaceRels.length; i++) {
        thesaurusRelsForReplace.add(replaceRels[i]);
    }
}
059:
060: public static String getSeparators() {
061: // String separators = ";: ,.!?\t\"\'\r\n\\/()[]{}*%<>-+_";
062: String seps = UtilProperties.getPropertyValue("keywordsearch",
063: "index.keyword.separators",
064: ";: ,.!?\t\"\'\r\n\\/()[]{}*%<>-+_");
065: return seps;
066: }
067:
068: public static String getStopWordBagOr() {
069: return UtilProperties.getPropertyValue("keywordsearch",
070: "stop.word.bag.or");
071: }
072:
073: public static String getStopWordBagAnd() {
074: return UtilProperties.getPropertyValue("keywordsearch",
075: "stop.word.bag.and");
076: }
077:
078: public static boolean getRemoveStems() {
079: String removeStemsStr = UtilProperties.getPropertyValue(
080: "keywordsearch", "remove.stems");
081: return "true".equals(removeStemsStr);
082: }
083:
084: public static Set getStemSet() {
085: String stemBag = UtilProperties.getPropertyValue(
086: "keywordsearch", "stem.bag");
087: Set stemSet = new TreeSet();
088: if (UtilValidate.isNotEmpty(stemBag)) {
089: String curToken;
090: StringTokenizer tokenizer = new StringTokenizer(stemBag,
091: ": ");
092: while (tokenizer.hasMoreTokens()) {
093: curToken = tokenizer.nextToken();
094: stemSet.add(curToken);
095: }
096: }
097: return stemSet;
098: }
099:
100: public static void processForKeywords(String str, Map keywords,
101: boolean forSearch, boolean anyPrefix, boolean anySuffix,
102: boolean isAnd) {
103: String separators = getSeparators();
104: String stopWordBagOr = getStopWordBagOr();
105: String stopWordBagAnd = getStopWordBagAnd();
106:
107: boolean removeStems = getRemoveStems();
108: Set stemSet = getStemSet();
109:
110: processForKeywords(str, keywords, separators, stopWordBagAnd,
111: stopWordBagOr, removeStems, stemSet, forSearch,
112: anyPrefix, anySuffix, isAnd);
113: }
114:
115: public static void processKeywordsForIndex(String str,
116: Map keywords, String separators, String stopWordBagAnd,
117: String stopWordBagOr, boolean removeStems, Set stemSet) {
118: processForKeywords(str, keywords, separators, stopWordBagAnd,
119: stopWordBagOr, removeStems, stemSet, false, false,
120: false, false);
121: }
122:
123: public static void processForKeywords(String str, Map keywords,
124: String separators, String stopWordBagAnd,
125: String stopWordBagOr, boolean removeStems, Set stemSet,
126: boolean forSearch, boolean anyPrefix, boolean anySuffix,
127: boolean isAnd) {
128: Set keywordSet = makeKeywordSet(str, separators, forSearch);
129: fixupKeywordSet(keywordSet, keywords, stopWordBagAnd,
130: stopWordBagOr, removeStems, stemSet, forSearch,
131: anyPrefix, anySuffix, isAnd);
132: }
133:
134: public static void fixupKeywordSet(Set keywordSet, Map keywords,
135: String stopWordBagAnd, String stopWordBagOr,
136: boolean removeStems, Set stemSet, boolean forSearch,
137: boolean anyPrefix, boolean anySuffix, boolean isAnd) {
138: if (keywordSet == null) {
139: return;
140: }
141:
142: Iterator keywordIter = keywordSet.iterator();
143: while (keywordIter.hasNext()) {
144: String token = (String) keywordIter.next();
145:
146: // when cleaning up the tokens the ordering is inportant: check stop words, remove stems, then get rid of 1 character tokens (1 digit okay)
147:
148: // check stop words
149: String colonToken = ":" + token + ":";
150: if (forSearch) {
151: if ((isAnd && stopWordBagAnd.indexOf(colonToken) >= 0)
152: || (!isAnd && stopWordBagOr.indexOf(colonToken) >= 0)) {
153: continue;
154: }
155: } else {
156: if (stopWordBagOr.indexOf(colonToken) >= 0
157: && stopWordBagAnd.indexOf(colonToken) >= 0) {
158: continue;
159: }
160: }
161:
162: // remove stems
163: if (removeStems) {
164: Iterator stemIter = stemSet.iterator();
165: while (stemIter.hasNext()) {
166: String stem = (String) stemIter.next();
167: if (token.endsWith(stem)) {
168: token = token.substring(0, token.length()
169: - stem.length());
170: }
171: }
172: }
173:
174: // get rid of all length 0 tokens now
175: if (token.length() == 0) {
176: continue;
177: }
178:
179: // get rid of all length 1 character only tokens, pretty much useless
180: if (token.length() == 1
181: && Character.isLetter(token.charAt(0))) {
182: continue;
183: }
184:
185: if (forSearch) {
186: StringBuffer strSb = new StringBuffer();
187: if (anyPrefix)
188: strSb.append('%');
189: strSb.append(token);
190: if (anySuffix)
191: strSb.append('%');
192: // replace all %% with %
193: int dblPercIdx = -1;
194: while ((dblPercIdx = strSb.indexOf("%%")) >= 0) {
195: //Debug.logInfo("before strSb: " + strSb, module);
196: strSb.replace(dblPercIdx, dblPercIdx + 2, "%");
197: //Debug.logInfo("after strSb: " + strSb, module);
198: }
199: token = strSb.toString();
200: }
201:
202: // group by word, add up weight
203: Long curWeight = (Long) keywords.get(token);
204: if (curWeight == null) {
205: keywords.put(token, new Long(1));
206: } else {
207: keywords
208: .put(token, new Long(curWeight.longValue() + 1));
209: }
210: }
211: }
212:
213: public static Set makeKeywordSet(String str, String separators,
214: boolean forSearch) {
215: if (separators == null)
216: separators = getSeparators();
217:
218: Set keywords = new TreeSet();
219: if (str.length() > 0) {
220: // strip off weird characters
221: str = str.replaceAll("\\\302\\\240|\\\240", " ");
222:
223: if (forSearch) {
224: // remove %_*? from separators if is for a search
225: StringBuffer sb = new StringBuffer(separators);
226: if (sb.indexOf("%") >= 0)
227: sb.deleteCharAt(sb.indexOf("%"));
228: if (sb.indexOf("_") >= 0)
229: sb.deleteCharAt(sb.indexOf("_"));
230: if (sb.indexOf("*") >= 0)
231: sb.deleteCharAt(sb.indexOf("*"));
232: if (sb.indexOf("?") >= 0)
233: sb.deleteCharAt(sb.indexOf("?"));
234: separators = sb.toString();
235: }
236:
237: StringTokenizer tokener = new StringTokenizer(str,
238: separators, false);
239: while (tokener.hasMoreTokens()) {
240: // make sure it is lower case before doing anything else
241: String token = tokener.nextToken().toLowerCase();
242:
243: if (forSearch) {
244: // these characters will only be present if it is for a search, ie not for indexing
245: token = token.replace('*', '%');
246: token = token.replace('?', '_');
247: }
248:
249: keywords.add(token);
250: }
251: }
252: return keywords;
253: }
254:
255: public static Set fixKeywordsForSearch(Set keywordSet,
256: boolean anyPrefix, boolean anySuffix, boolean removeStems,
257: boolean isAnd) {
258: Map keywords = new HashMap();
259: fixupKeywordSet(keywordSet, keywords, getStopWordBagAnd(),
260: getStopWordBagOr(), removeStems, getStemSet(), true,
261: anyPrefix, anySuffix, isAnd);
262: return keywords.keySet();
263: }
264:
265: public static boolean expandKeywordForSearch(String enteredKeyword,
266: Set addToSet, GenericDelegator delegator) {
267: boolean replaceEnteredKeyword = false;
268:
269: try {
270: List thesaurusList = delegator.findByAndCache(
271: "KeywordThesaurus", UtilMisc.toMap(
272: "enteredKeyword", enteredKeyword));
273: Iterator thesaurusIter = thesaurusList.iterator();
274: while (thesaurusIter.hasNext()) {
275: GenericValue keywordThesaurus = (GenericValue) thesaurusIter
276: .next();
277: String relationshipEnumId = (String) keywordThesaurus
278: .get("relationshipEnumId");
279: if (thesaurusRelsToInclude.contains(relationshipEnumId)) {
280: addToSet
281: .addAll(makeKeywordSet(keywordThesaurus
282: .getString("alternateKeyword"),
283: null, true));
284: if (thesaurusRelsForReplace
285: .contains(relationshipEnumId)) {
286: replaceEnteredKeyword = true;
287: }
288: }
289: }
290: } catch (GenericEntityException e) {
291: Debug
292: .logError(e, "Error expanding entered keyword",
293: module);
294: }
295:
296: Debug.logInfo("Expanded keyword [" + enteredKeyword
297: + "], got set: " + addToSet, module);
298: return replaceEnteredKeyword;
299: }
300: }
|