001: package org.contineo.core.text.analyze;
002:
003: import java.lang.reflect.InvocationTargetException;
004: import java.text.BreakIterator;
005: import java.util.Hashtable;
006:
007: import org.apache.commons.logging.Log;
008: import org.apache.commons.logging.LogFactory;
009:
010: /**
011: * @author Michael Scholz
012: * @author Alessandro Gasparini
013: */
014: public class Analyzer extends WordRanker {
015:
016: protected static Log log = LogFactory.getLog(Analyzer.class);
017:
018: private String[] stopwords;
019:
020: private String language;
021:
022: /**
023: * Creates a new instance of Analyzer.
024: *
025: * @param language Two characters language ISO 639-1
026: * @param stopwords Array of user specific stop words.
027: */
028: Analyzer(String language, String[] stopwords) {
029: this .language = language;
030: stoptable = StopTable.setStopWords(stopwords);
031: }
032:
033: /**
034: * Creates a new instance of Analyzer.
035: *
036: * @param language Two characters language ISO 639-1
037: * @param len Minimum length of words which should analyzed.
038: */
039: Analyzer(String language, int len) {
040: this .language = language;
041: minlen = len;
042: stopwords = Stopwords.getStopwords(language);
043: stoptable = StopTable.setStopWords(stopwords);
044: }
045:
046: /**
047: * This method analyzes a given text an fills a hitlist.
048: *
049: * @param text Text which should analyzed.
050: * @throws Exception In case of exception during Stemmer instantation
051: */
052: public void analyze(String text) throws Exception {
053: BreakIterator boundary = BreakIterator.getWordInstance();
054: boundary.setText(text);
055:
056: Stemmer stemmer = null;
057: try {
058: stemmer = new Stemmer(language);
059: } catch (Exception e) {
060: log.error("Unable to instantiate a Stemmer for language "
061: + language, e);
062: throw e;
063: }
064: AnalyseResult result = performAnalysis(boundary,
065: new StringBuffer(text), stoptable, minlen, stemmer);
066: wordcount = result.getWordCount();
067: wordtable = result.getWordTable();
068: }
069:
070: public String getLanguage() {
071: return language;
072: }
073:
074: /**
075: * Analyses a text and builds a table with each unique word stem,
076: * number of stem presence in the text and original word.
077: */
078: AnalyseResult performAnalysis(BreakIterator boundary,
079: StringBuffer source, Hashtable stopwords, int minlen,
080: Stemmer stemmer) throws IllegalArgumentException,
081: IllegalAccessException, InvocationTargetException {
082:
083: int start = boundary.first();
084: long wordcount = 0;
085: AnalyseResult result = new AnalyseResult();
086: Hashtable<String, WordEntry> wordtable = new Hashtable<String, WordEntry>(
087: source.length() / 6);
088:
089: for (int end = boundary.next(); end != BreakIterator.DONE; start = end, end = boundary
090: .next()) {
091:
092: String word = source.substring(start, end).trim();
093: char next = ' ';
094: try {
095: next = source.charAt(end);
096: } catch (Exception e) {
097: }
098:
099: if (word.length() > minlen) {
100: String stem = stemmer.stem(word);
101: WordEntry entry = new WordEntry();
102:
103: if ((word.length() >= minlen)
104: && !stopwords.containsKey(word)
105: && !stopwords.containsKey(stem)) {
106: wordcount++;
107:
108: if (wordtable.containsKey(stem)) {
109: entry = (WordEntry) wordtable.get(stem);
110: entry.incValue();
111:
112: if ((word.length() < entry.getOriginWord()
113: .length())
114: && (next != (char) 45)) {
115: entry.setOriginWord(word);
116: }
117: } else {
118: entry.incValue();
119: entry.setOriginWord(word);
120: }
121: wordtable.put(stem, entry);
122: }
123: }
124: }
125:
126: result.setWordCount(wordcount);
127: result.setWordTable(wordtable);
128:
129: return result;
130: }
131:
132: }
|