01: package org.contineo.core.text.analyze;
02:
03: import java.util.ArrayList;
04: import java.util.Collection;
05: import java.util.Enumeration;
06: import java.util.Hashtable;
07:
08: /**
09: * Provides functionality like getting the top most words in a document.
10: *
11: * @author Michael Scholz
12: */
13: public abstract class WordRanker {
14: protected Hashtable stoptable = new Hashtable();
15:
16: protected Hashtable<String, WordEntry> wordtable = new Hashtable<String, WordEntry>();
17:
18: protected long wordcount = 0;
19:
20: protected int minlen = 2;
21:
22: protected Entry getTopWord(Hashtable table) {
23: Entry entry = new Entry();
24: Enumeration enum1 = table.keys();
25: int topvalue = -1;
26: String topword = "";
27: String topOriginWord = "";
28:
29: while (enum1.hasMoreElements()) {
30: String key = (String) enum1.nextElement();
31: WordEntry termEntry = (WordEntry) table.get(key);
32: int val = termEntry.getValue();
33:
34: if (val > topvalue) {
35: topvalue = val;
36: topword = key;
37: topOriginWord = termEntry.getOriginWord();
38: }
39: }
40:
41: entry.setWord(topword);
42: entry.setNumber(topvalue);
43: entry.setOriginWord(topOriginWord);
44: return entry;
45: }
46:
47: /**
48: * Returns the top words of an analyzed document.
49: *
50: * @param hits - Number of top words to be returned.
51: * @return
52: */
53: public Collection getTopWords(int hits) {
54: Hashtable table = new Hashtable<String, WordEntry>(wordtable);
55: Collection<Entry> coll = new ArrayList<Entry>(hits);
56:
57: if (hits > table.size()) {
58: hits = table.size();
59: }
60:
61: for (int i = 0; i < hits; i++) {
62: Entry e = getTopWord(table);
63:
64: if (!e.getWord().equals("")) {
65: coll.add(e);
66: table.remove(e.getWord());
67: }
68: }
69:
70: return coll;
71: }
72:
73: /**
74: * @return Number of entries in the hitlist containing the topwords.
75: */
76: public int relevantWords() {
77: return wordtable.size();
78: }
79:
80: /**
81: * @return Number of words in the analyzed document.
82: */
83: public long getWordCount() {
84: return wordcount;
85: }
86:
87: public int getMinlen() {
88: return minlen;
89: }
90:
91: /**
92: * Sets the minimum length of words which should be analyzed.
93: *
94: * @param minlen
95: */
96: public void setMinlen(int minlen) {
97: this.minlen = minlen;
98: }
99: }
|