001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * ComplementNaiveBayes.java
019: * Copyright (C) 2003 University of Waikato, Hamilton, New Zealand
020: */
021:
022: package weka.classifiers.bayes;
023:
024: import weka.classifiers.Classifier;
025: import weka.core.Capabilities;
026: import weka.core.FastVector;
027: import weka.core.Instance;
028: import weka.core.Instances;
029: import weka.core.Option;
030: import weka.core.OptionHandler;
031: import weka.core.TechnicalInformation;
032: import weka.core.TechnicalInformationHandler;
033: import weka.core.Utils;
034: import weka.core.WeightedInstancesHandler;
035: import weka.core.Capabilities.Capability;
036: import weka.core.TechnicalInformation.Field;
037: import weka.core.TechnicalInformation.Type;
038:
039: /**
040: <!-- globalinfo-start -->
041: * Class for building and using a Complement class Naive Bayes classifier.<br/>
042: * <br/>
043: * For more information see, <br/>
044: * <br/>
045: * Jason D. Rennie, Lawrence Shih, Jaime Teevan, David R. Karger: Tackling the Poor Assumptions of Naive Bayes Text Classifiers. In: ICML, 616-623, 2003.<br/>
046: * <br/>
047: * P.S.: TF, IDF and length normalization transforms, as described in the paper, can be performed through weka.filters.unsupervised.StringToWordVector.
048: * <p/>
049: <!-- globalinfo-end -->
050: *
051: <!-- technical-bibtex-start -->
052: * BibTeX:
053: * <pre>
054: * @inproceedings{Rennie2003,
055: * author = {Jason D. Rennie and Lawrence Shih and Jaime Teevan and David R. Karger},
056: * booktitle = {ICML},
057: * pages = {616-623},
058: * publisher = {AAAI Press},
059: * title = {Tackling the Poor Assumptions of Naive Bayes Text Classifiers},
060: * year = {2003}
061: * }
062: * </pre>
063: * <p/>
064: <!-- technical-bibtex-end -->
065: *
066: <!-- options-start -->
067: * Valid options are: <p/>
068: *
069: * <pre> -N
070: * Normalize the word weights for each class
071: * </pre>
072: *
073: * <pre> -S
074: * Smoothing value to avoid zero WordGivenClass probabilities (default=1.0).
075: * </pre>
076: *
077: <!-- options-end -->
078: *
079: * @author Ashraf M. Kibriya (amk14@cs.waikato.ac.nz)
080: * @version $Revision: 1.8 $
081: */
082: public class ComplementNaiveBayes extends Classifier implements
083: OptionHandler, WeightedInstancesHandler,
084: TechnicalInformationHandler {
085:
086: /** for serialization */
087: static final long serialVersionUID = 7246302925903086397L;
088:
089: /**
090: Weight of words for each class. The weight is actually the
091: log of the probability of a word (w) given a class (c)
092: (i.e. log(Pr[w|c])). The format of the matrix is:
093: wordWeights[class][wordAttribute]
094: */
095: private double[][] wordWeights;
096:
097: /** Holds the smoothing value to avoid word probabilities of zero.<br>
098: P.S.: According to the paper this is the Alpha i parameter
099: */
100: private double smoothingParameter = 1.0;
101:
102: /** True if the words weights are to be normalized */
103: private boolean m_normalizeWordWeights = false;
104:
105: /** Holds the number of Class values present in the set of specified
106: instances */
107: private int numClasses;
108:
109: /** The instances header that'll be used in toString */
110: private Instances header;
111:
112: /**
113: * Returns an enumeration describing the available options.
114: *
115: * @return an enumeration of all the available options.
116: */
117: public java.util.Enumeration listOptions() {
118: FastVector newVector = new FastVector(2);
119: newVector.addElement(new Option(
120: "\tNormalize the word weights for each class\n", "N",
121: 0, "-N"));
122: newVector.addElement(new Option(
123: "\tSmoothing value to avoid zero WordGivenClass"
124: + " probabilities (default=1.0).\n", "S", 1,
125: "-S"));
126:
127: return newVector.elements();
128: }
129:
130: /**
131: * Gets the current settings of the classifier.
132: *
133: * @return an array of strings suitable for passing to setOptions
134: */
135: public String[] getOptions() {
136: String options[] = new String[4];
137: int current = 0;
138:
139: if (getNormalizeWordWeights())
140: options[current++] = "-N";
141:
142: options[current++] = "-S";
143: options[current++] = Double.toString(smoothingParameter);
144:
145: while (current < options.length) {
146: options[current++] = "";
147: }
148:
149: return options;
150: }
151:
152: /**
153: * Parses a given list of options. <p/>
154: *
155: <!-- options-start -->
156: * Valid options are: <p/>
157: *
158: * <pre> -N
159: * Normalize the word weights for each class
160: * </pre>
161: *
162: * <pre> -S
163: * Smoothing value to avoid zero WordGivenClass probabilities (default=1.0).
164: * </pre>
165: *
166: <!-- options-end -->
167: *
168: * @param options the list of options as an array of strings
169: * @throws Exception if an option is not supported
170: */
171: public void setOptions(String[] options) throws Exception {
172:
173: setNormalizeWordWeights(Utils.getFlag('N', options));
174:
175: String val = Utils.getOption('S', options);
176: if (val.length() != 0)
177: setSmoothingParameter(Double.parseDouble(val));
178: else
179: setSmoothingParameter(1.0);
180: }
181:
182: /**
183: * Returns true if the word weights for each class are to be normalized
184: *
185: * @return true if the word weights are normalized
186: */
187: public boolean getNormalizeWordWeights() {
188: return m_normalizeWordWeights;
189: }
190:
191: /**
192: * Sets whether if the word weights for each class should be normalized
193: *
194: * @param doNormalize whether the word weights are to be normalized
195: */
196: public void setNormalizeWordWeights(boolean doNormalize) {
197: m_normalizeWordWeights = doNormalize;
198: }
199:
200: /**
201: * Returns the tip text for this property
202: * @return tip text for this property suitable for
203: * displaying in the explorer/experimenter gui
204: */
205: public String normalizeWordWeightsTipText() {
206: return "Normalizes the word weights for each class.";
207: }
208:
209: /**
210: * Gets the smoothing value to be used to avoid zero WordGivenClass
211: * probabilities.
212: *
213: * @return the smoothing value
214: */
215: public double getSmoothingParameter() {
216: return smoothingParameter;
217: }
218:
219: /**
220: * Sets the smoothing value used to avoid zero WordGivenClass probabilities
221: *
222: * @param val the new smooting value
223: */
224: public void setSmoothingParameter(double val) {
225: smoothingParameter = val;
226: }
227:
228: /**
229: * Returns the tip text for this property
230: * @return tip text for this property suitable for
231: * displaying in the explorer/experimenter gui
232: */
233: public String smoothingParameterTipText() {
234: return "Sets the smoothing parameter to avoid zero WordGivenClass "
235: + "probabilities (default=1.0).";
236: }
237:
238: /**
239: * Returns a string describing this classifier
240: * @return a description of the classifier suitable for
241: * displaying in the explorer/experimenter gui
242: */
243: public String globalInfo() {
244:
245: return "Class for building and using a Complement class Naive Bayes "
246: + "classifier.\n\nFor more information see, \n\n"
247: + getTechnicalInformation().toString()
248: + "\n\n"
249: + "P.S.: TF, IDF and length normalization transforms, as "
250: + "described in the paper, can be performed through "
251: + "weka.filters.unsupervised.StringToWordVector.";
252: }
253:
254: /**
255: * Returns an instance of a TechnicalInformation object, containing
256: * detailed information about the technical background of this class,
257: * e.g., paper reference or book this class is based on.
258: *
259: * @return the technical information about this class
260: */
261: public TechnicalInformation getTechnicalInformation() {
262: TechnicalInformation result;
263:
264: result = new TechnicalInformation(Type.INPROCEEDINGS);
265: result
266: .setValue(Field.AUTHOR,
267: "Jason D. Rennie and Lawrence Shih and Jaime Teevan and David R. Karger");
268: result
269: .setValue(Field.TITLE,
270: "Tackling the Poor Assumptions of Naive Bayes Text Classifiers");
271: result.setValue(Field.BOOKTITLE, "ICML");
272: result.setValue(Field.YEAR, "2003");
273: result.setValue(Field.PAGES, "616-623");
274: result.setValue(Field.PUBLISHER, "AAAI Press");
275:
276: return result;
277: }
278:
279: /**
280: * Returns default capabilities of the classifier.
281: *
282: * @return the capabilities of this classifier
283: */
284: public Capabilities getCapabilities() {
285: Capabilities result = super .getCapabilities();
286:
287: // attributes
288: result.enable(Capability.NUMERIC_ATTRIBUTES);
289: result.enable(Capability.MISSING_VALUES);
290:
291: // class
292: result.enable(Capability.NOMINAL_CLASS);
293: result.enable(Capability.MISSING_CLASS_VALUES);
294:
295: return result;
296: }
297:
298: /**
299: * Generates the classifier.
300: *
301: * @param instances set of instances serving as training data
302: * @throws Exception if the classifier has not been built successfully
303: */
304: public void buildClassifier(Instances instances) throws Exception {
305:
306: // can classifier handle the data?
307: getCapabilities().testWithFail(instances);
308:
309: // remove instances with missing class
310: instances = new Instances(instances);
311: instances.deleteWithMissingClass();
312:
313: numClasses = instances.numClasses();
314: int numAttributes = instances.numAttributes();
315:
316: header = new Instances(instances, 0);
317: double[][] ocrnceOfWordInClass = new double[numClasses][numAttributes];
318: wordWeights = new double[numClasses][numAttributes];
319: //double [] docsPerClass = new double[numClasses];
320: double[] wordsPerClass = new double[numClasses];
321: double totalWordOccurrences = 0;
322: double sumOfSmoothingParams = (numAttributes - 1)
323: * smoothingParameter;
324: int classIndex = instances.instance(0).classIndex();
325: Instance instance;
326: int docClass;
327: double numOccurrences;
328:
329: java.util.Enumeration enumInsts = instances
330: .enumerateInstances();
331: while (enumInsts.hasMoreElements()) {
332: instance = (Instance) enumInsts.nextElement();
333: docClass = (int) instance.value(classIndex);
334: //docsPerClass[docClass] += instance.weight();
335:
336: for (int a = 0; a < instance.numValues(); a++)
337: if (instance.index(a) != instance.classIndex()) {
338: if (!instance.isMissing(a)) {
339: numOccurrences = instance.valueSparse(a)
340: * instance.weight();
341: if (numOccurrences < 0)
342: throw new Exception("Numeric attribute"
343: + " values must all be greater"
344: + " or equal to zero.");
345: totalWordOccurrences += numOccurrences;
346: wordsPerClass[docClass] += numOccurrences;
347: ocrnceOfWordInClass[docClass][instance.index(a)] += numOccurrences;
348: //For the time being wordweights[0][i]
349: //will hold the total occurrence of word
350: // i over all classes
351: wordWeights[0][instance.index(a)] += numOccurrences;
352: }
353: }
354: }
355:
356: //Calculating the complement class probability for all classes except 0
357: for (int c = 1; c < numClasses; c++) {
358: //total occurrence of words in classes other than c
359: double totalWordOcrnces = totalWordOccurrences
360: - wordsPerClass[c];
361:
362: for (int w = 0; w < numAttributes; w++) {
363: if (w != classIndex) {
364: //occurrence of w in classes other that c
365: double ocrncesOfWord = wordWeights[0][w]
366: - ocrnceOfWordInClass[c][w];
367:
368: wordWeights[c][w] = Math
369: .log((ocrncesOfWord + smoothingParameter)
370: / (totalWordOcrnces + sumOfSmoothingParams));
371: }
372: }
373: }
374:
375: //Now calculating the complement class probability for class 0
376: for (int w = 0; w < numAttributes; w++) {
377: if (w != classIndex) {
378: //occurrence of w in classes other that c
379: double ocrncesOfWord = wordWeights[0][w]
380: - ocrnceOfWordInClass[0][w];
381: //total occurrence of words in classes other than c
382: double totalWordOcrnces = totalWordOccurrences
383: - wordsPerClass[0];
384:
385: wordWeights[0][w] = Math
386: .log((ocrncesOfWord + smoothingParameter)
387: / (totalWordOcrnces + sumOfSmoothingParams));
388: }
389: }
390:
391: //Normalizing weights
392: if (m_normalizeWordWeights == true)
393: for (int c = 0; c < numClasses; c++) {
394: double sum = 0;
395: for (int w = 0; w < numAttributes; w++) {
396: if (w != classIndex)
397: sum += Math.abs(wordWeights[c][w]);
398: }
399: for (int w = 0; w < numAttributes; w++) {
400: if (w != classIndex) {
401: wordWeights[c][w] = wordWeights[c][w] / sum;
402: }
403: }
404: }
405:
406: }
407:
408: /**
409: * Classifies a given instance. <p>
410: *
411: * The classification rule is: <br>
412: * MinC(forAllWords(ti*Wci)) <br>
413: * where <br>
414: * ti is the frequency of word i in the given instance <br>
415: * Wci is the weight of word i in Class c. <p>
416: *
417: * For more information see section 4.4 of the paper mentioned above
418: * in the classifiers description.
419: *
420: * @param instance the instance to classify
421: * @return the index of the class the instance is most likely to belong.
422: * @throws Exception if the classifier has not been built yet.
423: */
424: public double classifyInstance(Instance instance) throws Exception {
425:
426: if (wordWeights == null)
427: throw new Exception(
428: "Error. The classifier has not been built "
429: + "properly.");
430:
431: double[] valueForClass = new double[numClasses];
432: double sumOfClassValues = 0;
433:
434: for (int c = 0; c < numClasses; c++) {
435: double sumOfWordValues = 0;
436: for (int w = 0; w < instance.numValues(); w++) {
437: if (instance.index(w) != instance.classIndex()) {
438: double freqOfWordInDoc = instance.valueSparse(w);
439: sumOfWordValues += freqOfWordInDoc
440: * wordWeights[c][instance.index(w)];
441: }
442: }
443: //valueForClass[c] = Math.log(probOfClass[c]) - sumOfWordValues;
444: valueForClass[c] = sumOfWordValues;
445: sumOfClassValues += valueForClass[c];
446: }
447:
448: int minidx = 0;
449: for (int i = 0; i < numClasses; i++)
450: if (valueForClass[i] < valueForClass[minidx])
451: minidx = i;
452:
453: return minidx;
454: }
455:
456: /**
457: * Prints out the internal model built by the classifier. In this case
458: * it prints out the word weights calculated when building the classifier.
459: */
460: public String toString() {
461: if (wordWeights == null) {
462: return "The classifier hasn't been built yet.";
463: }
464:
465: int numAttributes = header.numAttributes();
466: StringBuffer result = new StringBuffer(
467: "The word weights for each class are: \n"
468: + "------------------------------------\n\t");
469:
470: for (int c = 0; c < numClasses; c++)
471: result.append(header.classAttribute().value(c))
472: .append("\t");
473:
474: result.append("\n");
475:
476: for (int w = 0; w < numAttributes; w++) {
477: result.append(header.attribute(w).name()).append("\t");
478: for (int c = 0; c < numClasses; c++)
479: result.append(Double.toString(wordWeights[c][w]))
480: .append("\t");
481: result.append("\n");
482: }
483:
484: return result.toString();
485: }
486:
487: /**
488: * Main method for testing this class.
489: *
490: * @param argv the options
491: */
492: public static void main(String[] argv) {
493: runClassifier(new ComplementNaiveBayes(), argv);
494: }
495: }
|