001: package prefuse.data.search;
002:
003: import java.util.Iterator;
004: import java.util.LinkedList;
005: import java.util.NoSuchElementException;
006:
007: import prefuse.data.Tuple;
008:
009: /**
010: * A trie data structure for fast-lookup of words based on their
011: * prefixes. The name "Trie" is a play on the words "tree" and
012: * "retrieval". This class builds a tree structure representing a set of
013: * words by their prefixes. It is useful for performing prefix-based
014: * searches over large amounts of text in an efficient manner.
015: *
016: * @version 1.0
017: * @author <a href="http://jheer.org">jeffrey heer</a>
018: * @see PrefixSearchTupleSet
019: */
020: public class Trie {
021:
022: /**
023: * Base class for nodes in the trie structure.
024: */
025: public class TrieNode {
026: boolean isLeaf;
027: int leafCount = 0;
028: }
029:
030: /**
031: * A TrieNode implementation representing a branch in the tree. The
032: * class maintains a list of characters (the next character in the
033: * prefix) and associated children TrieNodes for each.
034: */
035: public class TrieBranch extends TrieNode {
036: char[] chars = new char[] { 0 };
037: TrieNode[] children = new TrieNode[1];
038: }
039:
040: /**
041: * A TrieNode implementation representing a leaf in the tree. The class
042: * stores the word and tuple for the leaf, as well as a reference to the
043: * successor leaf node in the trie.
044: */
045: public class TrieLeaf extends TrieNode {
046: public TrieLeaf(String word, Tuple t) {
047: this .word = word;
048: tuple = t;
049: next = null;
050: leafCount = 1;
051: }
052:
053: String word;
054: Tuple tuple;
055: TrieLeaf next;
056: }
057:
058: /**
059: * An iterator for traversing a subtree of the Trie.
060: */
061: public class TrieIterator implements Iterator {
062: private LinkedList queue;
063:
064: public TrieIterator(TrieNode node) {
065: queue = new LinkedList();
066: queue.add(node);
067: }
068:
069: public boolean hasNext() {
070: return !queue.isEmpty();
071: }
072:
073: public Object next() {
074: if (queue.isEmpty())
075: throw new NoSuchElementException();
076:
077: TrieNode n = (TrieNode) queue.removeFirst();
078: Object o;
079: if (n instanceof TrieLeaf) {
080: TrieLeaf l = (TrieLeaf) n;
081: o = l.tuple;
082: if (l.next != null)
083: queue.addFirst(l.next);
084: return o;
085: } else {
086: TrieBranch b = (TrieBranch) n;
087: for (int i = b.chars.length - 1; i > 0; i--) {
088: queue.addFirst(b.children[i]);
089: }
090: if (b.children[0] != null)
091: queue.addFirst(b.children[0]);
092: return next();
093: }
094: }
095:
096: public void remove() {
097: throw new UnsupportedOperationException();
098: }
099: } // end of inner clas TrieIterator
100:
101: private TrieBranch root = new TrieBranch();
102: private boolean caseSensitive = false;
103:
104: /**
105: * Create a new Trie with the specified case-sensitivity.
106: * @param caseSensitive true if the index should be case sensitive for
107: * indexed words, false otherwise.
108: */
109: public Trie(boolean caseSensitive) {
110: this .caseSensitive = caseSensitive;
111: }
112:
113: /**
114: * Indicates if this Trie's index takes the case of letters
115: * into account.
116: * @return true if the index is case-sensitive, false otherwise
117: */
118: public boolean isCaseSensitive() {
119: return caseSensitive;
120: }
121:
122: /**
123: * Add a new word to the trie, associated with the given Tuple.
124: * @param word the word to add to the Trie
125: * @param t the Tuple associated with the word
126: */
127: public void addString(String word, Tuple t) {
128: TrieLeaf leaf = new TrieLeaf(word, t);
129: addLeaf(root, leaf, 0);
130: }
131:
132: /**
133: * Remove a word/Tuple pair from the trie.
134: * @param word the word to remove
135: * @param t the associate Tuple to remove
136: */
137: public void removeString(String word, Tuple t) {
138: removeLeaf(root, word, t, 0);
139: }
140:
141: private final int getIndex(char[] chars, char c) {
142: for (int i = 0; i < chars.length; i++)
143: if (chars[i] == c)
144: return i;
145: return -1;
146: }
147:
148: private final char getChar(String s, int i) {
149: char c = (i < 0 || i >= s.length() ? 0 : s.charAt(i));
150: return (caseSensitive ? c : Character.toLowerCase(c));
151: }
152:
153: private final TrieNode equalityCheck(String word, TrieLeaf l) {
154: if (caseSensitive) {
155: return l.word.startsWith(word) ? l : null;
156: } else {
157: // do our own looping to avoid string allocation for case change
158: int len = word.length();
159: if (len > l.word.length())
160: return null;
161: for (int i = 0; i < len; ++i) {
162: char c1 = Character.toLowerCase(word.charAt(i));
163: char c2 = Character.toLowerCase(l.word.charAt(i));
164: if (c1 != c2)
165: return null;
166: }
167: return l;
168: }
169: }
170:
171: private boolean removeLeaf(TrieBranch b, String word, Tuple t,
172: int depth) {
173: char c = getChar(word, depth);
174: int i = getIndex(b.chars, c);
175:
176: if (i == -1) {
177: // couldn't find leaf
178: return false;
179: } else {
180: TrieNode n = b.children[i];
181: if (n instanceof TrieBranch) {
182: TrieBranch tb = (TrieBranch) n;
183: boolean rem = removeLeaf(tb, word, t, depth + 1);
184: if (rem) {
185: b.leafCount--;
186: if (tb.leafCount == 1)
187: b.children[i] = tb.children[tb.children[0] != null ? 0
188: : 1];
189: }
190: return rem;
191: } else {
192: TrieLeaf nl = (TrieLeaf) n;
193: if (nl.tuple == t) {
194: b.children[i] = nl.next;
195: if (nl.next == null)
196: repairBranch(b, i);
197: b.leafCount--;
198: return true;
199: } else {
200: TrieLeaf nnl = nl.next;
201: while (nnl != null && nnl.tuple != t) {
202: nl = nnl;
203: nnl = nnl.next;
204: }
205: if (nnl == null)
206: return false; // couldn't find leaf
207:
208: // update leaf counts
209: for (TrieLeaf tl = (TrieLeaf) n; tl.tuple != t; tl = tl.next)
210: tl.leafCount--;
211:
212: nl.next = nnl.next;
213: b.leafCount--;
214: return true;
215: }
216: }
217: }
218: }
219:
220: private void repairBranch(TrieBranch b, int i) {
221: if (i == 0) {
222: b.children[0] = null;
223: } else {
224: int len = b.chars.length;
225: char[] nchars = new char[len - 1];
226: TrieNode[] nkids = new TrieNode[len - 1];
227: System.arraycopy(b.chars, 0, nchars, 0, i);
228: System.arraycopy(b.children, 0, nkids, 0, i);
229: System.arraycopy(b.chars, i + 1, nchars, i, len - i - 1);
230: System.arraycopy(b.children, i + 1, nkids, i, len - i - 1);
231: b.chars = nchars;
232: b.children = nkids;
233: }
234: }
235:
236: private void addLeaf(TrieBranch b, TrieLeaf l, int depth) {
237: b.leafCount += l.leafCount;
238:
239: char c = getChar(l.word, depth);
240: int i = getIndex(b.chars, c);
241:
242: if (i == -1) {
243: addChild(b, l, c);
244: } else {
245: TrieNode n = b.children[i];
246: if (n == null) {
247: // we have completely spelled out the word
248: b.children[i] = l;
249: } else if (n instanceof TrieBranch) {
250: // recurse down the tree
251: addLeaf((TrieBranch) n, l, depth + 1);
252: } else {
253: // node is a leaf, need to do a split?
254: TrieLeaf nl = (TrieLeaf) n;
255: if (i == 0
256: || (caseSensitive ? nl.word.equals(l.word)
257: : nl.word.equalsIgnoreCase(l.word))) {
258: // same word, so chain the entries
259: for (; nl.next != null; nl = nl.next)
260: nl.leafCount++;
261: nl.leafCount++;
262: nl.next = l;
263: } else {
264: // different words, need to do a split
265: TrieBranch nb = new TrieBranch();
266: b.children[i] = nb;
267: addLeaf(nb, nl, depth + 1);
268: addLeaf(nb, l, depth + 1);
269: }
270: }
271: }
272: }
273:
274: private void addChild(TrieBranch b, TrieNode n, char c) {
275: int len = b.chars.length;
276: char[] nchars = new char[len + 1];
277: TrieNode[] nkids = new TrieNode[len + 1];
278: System.arraycopy(b.chars, 0, nchars, 0, len);
279: System.arraycopy(b.children, 0, nkids, 0, len);
280: nchars[len] = c;
281: nkids[len] = n;
282: b.chars = nchars;
283: b.children = nkids;
284: }
285:
286: /**
287: * Look up the given word in this Trie. If a match is found, a TrieNode
288: * is returned. This node is the root of a subtree containing all the
289: * matches to the query.
290: * @param word the word to lookup
291: * @return the TrieNode root of the subtree containing all matches. A
292: * null value is returned if no match is found.
293: */
294: public TrieNode find(String word) {
295: return (word.length() < 1 ? null : find(word, root, 0));
296: }
297:
298: private TrieNode find(String word, TrieBranch b, int depth) {
299: char c = getChar(word, depth);
300: int i = getIndex(b.chars, c);
301: if (i == -1) {
302: return null; // not in trie
303: } else if (word.length() - 1 == depth) {
304: return b.children[i]; // end of search
305: } else if (b.children[i] instanceof TrieLeaf) {
306: return equalityCheck(word, (TrieLeaf) b.children[i]);
307: } else {
308: return find(word, (TrieBranch) b.children[i], depth + 1); // recurse
309: }
310: }
311:
312: } // end of class Trie
|