001: package org.apache.lucene.analysis.ngram;
002:
003: /**
004: * Licensed to the Apache Software Foundation (ASF) under one or more
005: * contributor license agreements. See the NOTICE file distributed with
006: * this work for additional information regarding copyright ownership.
007: * The ASF licenses this file to You under the Apache License, Version 2.0
008: * (the "License"); you may not use this file except in compliance with
009: * the License. You may obtain a copy of the License at
010: *
011: * http://www.apache.org/licenses/LICENSE-2.0
012: *
013: * Unless required by applicable law or agreed to in writing, software
014: * distributed under the License is distributed on an "AS IS" BASIS,
015: * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
016: * See the License for the specific language governing permissions and
017: * limitations under the License.
018: */
019:
020: import org.apache.lucene.analysis.Token;
021: import org.apache.lucene.analysis.TokenFilter;
022: import org.apache.lucene.analysis.TokenStream;
023:
024: import java.io.IOException;
025: import java.util.LinkedList;
026:
027: /**
028: * Tokenizes the given token into n-grams of given size(s).
029: * @author Otis Gospodnetic
030: */
031: public class EdgeNGramTokenFilter extends TokenFilter {
032: public static final Side DEFAULT_SIDE = Side.FRONT;
033: public static final int DEFAULT_MAX_GRAM_SIZE = 1;
034: public static final int DEFAULT_MIN_GRAM_SIZE = 1;
035:
036: // Replace this with an enum when the Java 1.5 upgrade is made, the impl will be simplified
037: /** Specifies which side of the input the n-gram should be generated from */
038: public static class Side {
039: private String label;
040:
041: /** Get the n-gram from the front of the input */
042: public static Side FRONT = new Side("front");
043:
044: /** Get the n-gram from the end of the input */
045: public static Side BACK = new Side("back");
046:
047: // Private ctor
048: private Side(String label) {
049: this .label = label;
050: }
051:
052: public String getLabel() {
053: return label;
054: }
055:
056: // Get the appropriate Side from a string
057: public static Side getSide(String sideName) {
058: if (FRONT.getLabel().equals(sideName)) {
059: return FRONT;
060: } else if (BACK.getLabel().equals(sideName)) {
061: return BACK;
062: }
063: return null;
064: }
065: }
066:
067: private int minGram;
068: private int maxGram;
069: private Side side;
070: private LinkedList ngrams;
071:
072: protected EdgeNGramTokenFilter(TokenStream input) {
073: super (input);
074: this .ngrams = new LinkedList();
075: }
076:
077: /**
078: * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
079: *
080: * @param input TokenStream holding the input to be tokenized
081: * @param side the {@link Side} from which to chop off an n-gram
082: * @param minGram the smallest n-gram to generate
083: * @param maxGram the largest n-gram to generate
084: */
085: public EdgeNGramTokenFilter(TokenStream input, Side side,
086: int minGram, int maxGram) {
087: super (input);
088:
089: if (side == null) {
090: throw new IllegalArgumentException(
091: "sideLabel must be either front or back");
092: }
093:
094: if (minGram < 1) {
095: throw new IllegalArgumentException(
096: "minGram must be greater than zero");
097: }
098:
099: if (minGram > maxGram) {
100: throw new IllegalArgumentException(
101: "minGram must not be greater than maxGram");
102: }
103:
104: this .minGram = minGram;
105: this .maxGram = maxGram;
106: this .side = side;
107: this .ngrams = new LinkedList();
108: }
109:
110: /**
111: * Creates EdgeNGramTokenFilter that can generate n-grams in the sizes of the given range
112: *
113: * @param input TokenStream holding the input to be tokenized
114: * @param sideLabel the name of the {@link Side} from which to chop off an n-gram
115: * @param minGram the smallest n-gram to generate
116: * @param maxGram the largest n-gram to generate
117: */
118: public EdgeNGramTokenFilter(TokenStream input, String sideLabel,
119: int minGram, int maxGram) {
120: this (input, Side.getSide(sideLabel), minGram, maxGram);
121: }
122:
123: /** Returns the next token in the stream, or null at EOS. */
124: public final Token next() throws IOException {
125: if (ngrams.size() > 0) {
126: return (Token) ngrams.removeFirst();
127: }
128:
129: Token token = input.next();
130: if (token == null) {
131: return null;
132: }
133:
134: ngram(token);
135: if (ngrams.size() > 0)
136: return (Token) ngrams.removeFirst();
137: else
138: return null;
139: }
140:
141: private void ngram(Token token) {
142: String inStr = token.termText();
143: int inLen = inStr.length();
144: int gramSize = minGram;
145: while (gramSize <= maxGram) {
146: // if the remaining input is too short, we can't generate any n-grams
147: if (gramSize > inLen) {
148: return;
149: }
150:
151: // if we have hit the end of our n-gram size range, quit
152: if (gramSize > maxGram) {
153: return;
154: }
155:
156: Token tok;
157: if (side == Side.FRONT) {
158: tok = new Token(inStr.substring(0, gramSize), 0,
159: gramSize);
160: } else {
161: tok = new Token(inStr.substring(inLen - gramSize),
162: inLen - gramSize, inLen);
163: }
164: ngrams.add(tok);
165: gramSize++;
166: }
167: }
168: }
|