001: package org.contineo.core.text.analyze;
002:
003: import java.io.BufferedInputStream;
004: import java.io.File;
005: import java.io.FileInputStream;
006: import java.net.URLDecoder;
007: import java.util.Collection;
008: import java.util.Hashtable;
009: import java.util.Iterator;
010:
011: import junit.framework.TestCase;
012:
013: public class AnalyzerTest extends TestCase {
014:
015: public void testAnalyzerStringInt() {
016: Analyzer analyzer = new Analyzer("en", 4);
017: assertNotNull(analyzer);
018: assertEquals(4, analyzer.minlen);
019: assertEquals("en", analyzer.getLanguage());
020:
021: analyzer = new Analyzer("de", 5);
022: assertEquals(5, analyzer.minlen);
023: assertEquals("de", analyzer.getLanguage());
024: }
025:
026: public void testAnalizeItalian() throws Exception {
027: Analyzer analyzer = new Analyzer("it", 4);
028: // Text with a lot of words
029: String textToAnalize = "Festival di Torino, Moretti rinuncia \"Vi lascio ai vostri rancori personali\""
030: + " Il regista era stato nominato direttore artistico. Ma gli ideatori hanno polemizzato perché "
031: + "l'organizzazione era stata affidata al Museo del Cinema. Una lettera per l'addio. Chiamparino: "
032: + "\"Spero che ci ripensi\"."
033: + " Colpo di scena nella querelle sulla direzione del Torino Film Festival. Il regista Nanni Moretti, che due "
034: + "giorni fa aveva accettato l'incarico offerto dagli enti locali e dal Museo Nazionale del Cinema, l'ente "
035: + "organizzatore dell'edizione 2007 in programma nel prossimo novembre, sbatte la porta e se ne va: "
036: + "\"Con molto dolore rinuncio all'incarico e vi lascio ai vostri problemi di metodo, ai contrasti "
037: + "procedurali, ai rancori personali\", dice in una nota affidata in serata all' Ansa."
038: + "Il sistema di archiviazione Contineo è sviluppato in Java";
039:
040: analyzer.analyze(textToAnalize);
041:
042: long wordCount = analyzer.getWordCount();
043: System.out.println("wordCount = " + wordCount);
044: assertTrue(wordCount > 0);
045:
046: // Check the content of wordtable
047: Hashtable<String, WordEntry> wordtable = analyzer.wordtable;
048: assertNotNull(wordtable);
049: assertFalse(wordtable.isEmpty());
050:
051: System.out.println("size = " + wordtable.size());
052: for (String key : wordtable.keySet()) {
053: System.out.println(key);
054: }
055:
056: Collection coll = analyzer.getTopWords(15);
057: assertNotNull(coll);
058: assertTrue(coll.size() == 15);
059:
060: int countWord01 = 0;
061: int countWord02 = 0;
062: // The top words must contains the stem "morett", the word appears 2
063: // times in the text
064: for (Iterator iter = coll.iterator(); iter.hasNext();) {
065: Entry e = (Entry) iter.next();
066: System.out.println(e.getNumber());
067: System.out.println(e.getWord());
068: System.out.println(e.getOriginWord());
069: if (e.getWord().equals("morett"))
070: countWord01 = e.getNumber();
071: if (e.getWord().equals("rancor"))
072: countWord02 = e.getNumber();
073: }
074: assertTrue(2 == countWord01);
075: assertTrue(2 == countWord02);
076: }
077:
078: public void testAnalizeEnglish() throws Exception {
079: Analyzer analyzer = new Analyzer("en", 4);
080:
081: File file = new File(URLDecoder.decode(getClass()
082: .getClassLoader().getResource("homelidays_vision.txt")
083: .getPath(), "UTF-8"));
084: BufferedInputStream bis = new BufferedInputStream(
085: new FileInputStream(file));
086: StringBuffer content = new StringBuffer();
087: int ichar = 0;
088: while ((ichar = bis.read()) > 0) {
089: content.append((char) ichar);
090: }
091:
092: // Text with a lot of words
093: String textToAnalize = content.toString();
094:
095: analyzer.analyze(textToAnalize);
096:
097: long wordCount = analyzer.getWordCount();
098: System.out.println("wordCount = " + wordCount);
099: assertTrue(wordCount > 0);
100:
101: // Check the content of wordtable
102: Hashtable<String, WordEntry> wordtable = analyzer.wordtable;
103: assertNotNull(wordtable);
104: assertFalse(wordtable.isEmpty());
105:
106: System.out.println("size = " + wordtable.size());
107: for (String key : wordtable.keySet()) {
108: System.out.println(key);
109: }
110:
111: Collection coll = analyzer.getTopWords(10);
112: assertNotNull(coll);
113: assertTrue(coll.size() == 10);
114:
115: int countWord01 = 0;
116: int countWord02 = 0;
117: // The top words must contains the stem "holiday", the word appears 5
118: // times in the text
119: for (Iterator iter = coll.iterator(); iter.hasNext();) {
120: Entry e = (Entry) iter.next();
121: System.out.println(e.getNumber());
122: System.out.println(e.getWord());
123: System.out.println(e.getOriginWord());
124: if (e.getWord().equals("holiday"))
125: countWord01 = e.getNumber();
126: if (e.getWord().equals("accommod"))
127: countWord02 = e.getNumber();
128: }
129: assertTrue(5 == countWord01);
130: assertTrue(5 == countWord02);
131: }
132:
133: }
|