001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * BinC45ModelSelection.java
019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.classifiers.trees.j48;
024:
025: import weka.core.Attribute;
026: import weka.core.Instances;
027: import weka.core.Utils;
028:
029: import java.util.Enumeration;
030:
031: /**
032: * Class for selecting a C4.5-like binary (!) split for a given dataset.
033: *
034: * @author Eibe Frank (eibe@cs.waikato.ac.nz)
035: * @version $Revision: 1.10 $
036: */
037: public class BinC45ModelSelection extends ModelSelection {
038:
039: /** for serialization */
040: private static final long serialVersionUID = 179170923545122001L;
041:
042: /** Minimum number of instances in interval. */
043: private int m_minNoObj;
044:
045: /** The FULL training dataset. */
046: private Instances m_allData;
047:
048: /**
049: * Initializes the split selection method with the given parameters.
050: *
051: * @param minNoObj minimum number of instances that have to occur in
052: * at least two subsets induced by split
053: * @param allData FULL training dataset (necessary for selection of
054: * split points).
055: */
056: public BinC45ModelSelection(int minNoObj, Instances allData) {
057: m_minNoObj = minNoObj;
058: m_allData = allData;
059: }
060:
061: /**
062: * Sets reference to training data to null.
063: */
064: public void cleanup() {
065:
066: m_allData = null;
067: }
068:
069: /**
070: * Selects C4.5-type split for the given dataset.
071: */
072: public final ClassifierSplitModel selectModel(Instances data) {
073:
074: double minResult;
075: double currentResult;
076: BinC45Split[] currentModel;
077: BinC45Split bestModel = null;
078: NoSplit noSplitModel = null;
079: double averageInfoGain = 0;
080: int validModels = 0;
081: boolean multiVal = true;
082: Distribution checkDistribution;
083: double sumOfWeights;
084: int i;
085:
086: try {
087:
088: // Check if all Instances belong to one class or if not
089: // enough Instances to split.
090: checkDistribution = new Distribution(data);
091: noSplitModel = new NoSplit(checkDistribution);
092: if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj)
093: || Utils.eq(checkDistribution.total(),
094: checkDistribution
095: .perClass(checkDistribution
096: .maxClass())))
097: return noSplitModel;
098:
099: // Check if all attributes are nominal and have a
100: // lot of values.
101: Enumeration enu = data.enumerateAttributes();
102: while (enu.hasMoreElements()) {
103: Attribute attribute = (Attribute) enu.nextElement();
104: if ((attribute.isNumeric())
105: || (Utils.sm((double) attribute.numValues(),
106: (0.3 * (double) m_allData
107: .numInstances())))) {
108: multiVal = false;
109: break;
110: }
111: }
112: currentModel = new BinC45Split[data.numAttributes()];
113: sumOfWeights = data.sumOfWeights();
114:
115: // For each attribute.
116: for (i = 0; i < data.numAttributes(); i++) {
117:
118: // Apart from class attribute.
119: if (i != (data).classIndex()) {
120:
121: // Get models for current attribute.
122: currentModel[i] = new BinC45Split(i, m_minNoObj,
123: sumOfWeights);
124: currentModel[i].buildClassifier(data);
125:
126: // Check if useful split for current attribute
127: // exists and check for enumerated attributes with
128: // a lot of values.
129: if (currentModel[i].checkModel())
130: if ((data.attribute(i).isNumeric())
131: || (multiVal || Utils.sm((double) data
132: .attribute(i).numValues(),
133: (0.3 * (double) m_allData
134: .numInstances())))) {
135: averageInfoGain = averageInfoGain
136: + currentModel[i].infoGain();
137: validModels++;
138: }
139: } else
140: currentModel[i] = null;
141: }
142:
143: // Check if any useful split was found.
144: if (validModels == 0)
145: return noSplitModel;
146: averageInfoGain = averageInfoGain / (double) validModels;
147:
148: // Find "best" attribute to split on.
149: minResult = 0;
150: for (i = 0; i < data.numAttributes(); i++) {
151: if ((i != (data).classIndex())
152: && (currentModel[i].checkModel()))
153:
154: // Use 1E-3 here to get a closer approximation to the original
155: // implementation.
156: if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
157: && Utils.gr(currentModel[i].gainRatio(),
158: minResult)) {
159: bestModel = currentModel[i];
160: minResult = currentModel[i].gainRatio();
161: }
162: }
163:
164: // Check if useful split was found.
165: if (Utils.eq(minResult, 0))
166: return noSplitModel;
167:
168: // Add all Instances with unknown values for the corresponding
169: // attribute to the distribution for the model, so that
170: // the complete distribution is stored with the model.
171: bestModel.distribution().addInstWithUnknown(data,
172: bestModel.attIndex());
173:
174: // Set the split point analogue to C45 if attribute numeric.
175: bestModel.setSplitPoint(m_allData);
176: return bestModel;
177: } catch (Exception e) {
178: e.printStackTrace();
179: }
180: return null;
181: }
182:
183: /**
184: * Selects C4.5-type split for the given dataset.
185: */
186: public final ClassifierSplitModel selectModel(Instances train,
187: Instances test) {
188:
189: return selectModel(train);
190: }
191: }
|