001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * InfoGainSplitCrit.java
019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.classifiers.trees.j48;
024:
025: import weka.core.Utils;
026:
027: /**
028: * Class for computing the information gain for a given distribution.
029: *
030: * @author Eibe Frank (eibe@cs.waikato.ac.nz)
031: * @version $Revision: 1.9 $
032: */
033: public final class InfoGainSplitCrit extends EntropyBasedSplitCrit {
034:
035: /** for serialization */
036: private static final long serialVersionUID = 4892105020180728499L;
037:
038: /**
039: * This method is a straightforward implementation of the information
040: * gain criterion for the given distribution.
041: */
042: public final double splitCritValue(Distribution bags) {
043:
044: double numerator;
045:
046: numerator = oldEnt(bags) - newEnt(bags);
047:
048: // Splits with no gain are useless.
049: if (Utils.eq(numerator, 0))
050: return Double.MAX_VALUE;
051:
052: // We take the reciprocal value because we want to minimize the
053: // splitting criterion's value.
054: return bags.total() / numerator;
055: }
056:
057: /**
058: * This method computes the information gain in the same way
059: * C4.5 does.
060: *
061: * @param bags the distribution
062: * @param totalNoInst weight of ALL instances (including the
063: * ones with missing values).
064: */
065: public final double splitCritValue(Distribution bags,
066: double totalNoInst) {
067:
068: double numerator;
069: double noUnknown;
070: double unknownRate;
071: int i;
072:
073: noUnknown = totalNoInst - bags.total();
074: unknownRate = noUnknown / totalNoInst;
075: numerator = (oldEnt(bags) - newEnt(bags));
076: numerator = (1 - unknownRate) * numerator;
077:
078: // Splits with no gain are useless.
079: if (Utils.eq(numerator, 0))
080: return 0;
081:
082: return numerator / bags.total();
083: }
084:
085: /**
086: * This method computes the information gain in the same way
087: * C4.5 does.
088: *
089: * @param bags the distribution
090: * @param totalNoInst weight of ALL instances
091: * @param oldEnt entropy with respect to "no-split"-model.
092: */
093: public final double splitCritValue(Distribution bags,
094: double totalNoInst, double oldEnt) {
095:
096: double numerator;
097: double noUnknown;
098: double unknownRate;
099: int i;
100:
101: noUnknown = totalNoInst - bags.total();
102: unknownRate = noUnknown / totalNoInst;
103: numerator = (oldEnt - newEnt(bags));
104: numerator = (1 - unknownRate) * numerator;
105:
106: // Splits with no gain are useless.
107: if (Utils.eq(numerator, 0))
108: return 0;
109:
110: return numerator / bags.total();
111: }
112: }
|