001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * C45ModelSelection.java
019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.classifiers.trees.j48;
024:
025: import weka.core.Attribute;
026: import weka.core.Instances;
027: import weka.core.Utils;
028:
029: import java.util.Enumeration;
030:
031: /**
032: * Class for selecting a C4.5-type split for a given dataset.
033: *
034: * @author Eibe Frank (eibe@cs.waikato.ac.nz)
035: * @version $Revision: 1.10 $
036: */
037: public class C45ModelSelection extends ModelSelection {
038:
039: /** for serialization */
040: private static final long serialVersionUID = 3372204862440821989L;
041:
042: /** Minimum number of objects in interval. */
043: private int m_minNoObj;
044:
045: /** All the training data */
046: private Instances m_allData; //
047:
048: /**
049: * Initializes the split selection method with the given parameters.
050: *
051: * @param minNoObj minimum number of instances that have to occur in at least two
052: * subsets induced by split
053: * @param allData FULL training dataset (necessary for
054: * selection of split points).
055: */
056: public C45ModelSelection(int minNoObj, Instances allData) {
057: m_minNoObj = minNoObj;
058: m_allData = allData;
059: }
060:
061: /**
062: * Sets reference to training data to null.
063: */
064: public void cleanup() {
065:
066: m_allData = null;
067: }
068:
069: /**
070: * Selects C4.5-type split for the given dataset.
071: */
072: public final ClassifierSplitModel selectModel(Instances data) {
073:
074: double minResult;
075: double currentResult;
076: C45Split[] currentModel;
077: C45Split bestModel = null;
078: NoSplit noSplitModel = null;
079: double averageInfoGain = 0;
080: int validModels = 0;
081: boolean multiVal = true;
082: Distribution checkDistribution;
083: Attribute attribute;
084: double sumOfWeights;
085: int i;
086:
087: try {
088:
089: // Check if all Instances belong to one class or if not
090: // enough Instances to split.
091: checkDistribution = new Distribution(data);
092: noSplitModel = new NoSplit(checkDistribution);
093: if (Utils.sm(checkDistribution.total(), 2 * m_minNoObj)
094: || Utils.eq(checkDistribution.total(),
095: checkDistribution
096: .perClass(checkDistribution
097: .maxClass())))
098: return noSplitModel;
099:
100: // Check if all attributes are nominal and have a
101: // lot of values.
102: if (m_allData != null) {
103: Enumeration enu = data.enumerateAttributes();
104: while (enu.hasMoreElements()) {
105: attribute = (Attribute) enu.nextElement();
106: if ((attribute.isNumeric())
107: || (Utils.sm(
108: (double) attribute.numValues(),
109: (0.3 * (double) m_allData
110: .numInstances())))) {
111: multiVal = false;
112: break;
113: }
114: }
115: }
116:
117: currentModel = new C45Split[data.numAttributes()];
118: sumOfWeights = data.sumOfWeights();
119:
120: // For each attribute.
121: for (i = 0; i < data.numAttributes(); i++) {
122:
123: // Apart from class attribute.
124: if (i != (data).classIndex()) {
125:
126: // Get models for current attribute.
127: currentModel[i] = new C45Split(i, m_minNoObj,
128: sumOfWeights);
129: currentModel[i].buildClassifier(data);
130:
131: // Check if useful split for current attribute
132: // exists and check for enumerated attributes with
133: // a lot of values.
134: if (currentModel[i].checkModel())
135: if (m_allData != null) {
136: if ((data.attribute(i).isNumeric())
137: || (multiVal || Utils.sm(
138: (double) data.attribute(i)
139: .numValues(),
140: (0.3 * (double) m_allData
141: .numInstances())))) {
142: averageInfoGain = averageInfoGain
143: + currentModel[i].infoGain();
144: validModels++;
145: }
146: } else {
147: averageInfoGain = averageInfoGain
148: + currentModel[i].infoGain();
149: validModels++;
150: }
151: } else
152: currentModel[i] = null;
153: }
154:
155: // Check if any useful split was found.
156: if (validModels == 0)
157: return noSplitModel;
158: averageInfoGain = averageInfoGain / (double) validModels;
159:
160: // Find "best" attribute to split on.
161: minResult = 0;
162: for (i = 0; i < data.numAttributes(); i++) {
163: if ((i != (data).classIndex())
164: && (currentModel[i].checkModel()))
165:
166: // Use 1E-3 here to get a closer approximation to the original
167: // implementation.
168: if ((currentModel[i].infoGain() >= (averageInfoGain - 1E-3))
169: && Utils.gr(currentModel[i].gainRatio(),
170: minResult)) {
171: bestModel = currentModel[i];
172: minResult = currentModel[i].gainRatio();
173: }
174: }
175:
176: // Check if useful split was found.
177: if (Utils.eq(minResult, 0))
178: return noSplitModel;
179:
180: // Add all Instances with unknown values for the corresponding
181: // attribute to the distribution for the model, so that
182: // the complete distribution is stored with the model.
183: bestModel.distribution().addInstWithUnknown(data,
184: bestModel.attIndex());
185:
186: // Set the split point analogue to C45 if attribute numeric.
187: if (m_allData != null)
188: bestModel.setSplitPoint(m_allData);
189: return bestModel;
190: } catch (Exception e) {
191: e.printStackTrace();
192: }
193: return null;
194: }
195:
196: /**
197: * Selects C4.5-type split for the given dataset.
198: */
199: public final ClassifierSplitModel selectModel(Instances train,
200: Instances test) {
201:
202: return selectModel(train);
203: }
204: }
|