001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * ADNode.java
019: * Copyright (C) 2002 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.classifiers.bayes.net;
024:
025: import weka.core.FastVector;
026: import weka.core.Instance;
027: import weka.core.Instances;
028: import weka.core.TechnicalInformation;
029: import weka.core.TechnicalInformation.Type;
030: import weka.core.TechnicalInformation.Field;
031: import weka.core.TechnicalInformationHandler;
032:
033: import java.io.FileReader;
034: import java.io.Serializable;
035:
036: /**
037: * The ADNode class implements the ADTree datastructure which increases
038: * the speed with which sub-contingency tables can be constructed from
039: * a data set in an Instances object. For details, see: <p/>
040: *
041: <!-- technical-plaintext-start -->
042: * Andrew W. Moore, Mary S. Lee (1998). Cached Sufficient Statistics for Efficient Machine Learning with Large Datasets. Journal of Artificial Intelligence Research. 8:67-91.
043: <!-- technical-plaintext-end -->
044: * <p/>
045: *
046: <!-- technical-bibtex-start -->
047: * BibTeX:
048: * <pre>
049: * @article{Moore1998,
050: * author = {Andrew W. Moore and Mary S. Lee},
051: * journal = {Journal of Artificial Intelligence Research},
052: * pages = {67-91},
053: * title = {Cached Sufficient Statistics for Efficient Machine Learning with Large Datasets},
054: * volume = {8},
055: * year = {1998}
056: * }
057: * </pre>
058: * <p/>
059: <!-- technical-bibtex-end -->
060: *
061: * @author Remco Bouckaert (rrb@xm.co.nz)
062: * @version $Revision: 1.6 $
063: */
064: public class ADNode implements Serializable,
065: TechnicalInformationHandler {
066:
067: /** for serialization */
068: static final long serialVersionUID = 397409728366910204L;
069:
070: final static int MIN_RECORD_SIZE = 0;
071:
072: /** list of VaryNode children **/
073: public VaryNode[] m_VaryNodes;
074: /** list of Instance children (either m_Instances or m_VaryNodes is instantiated) **/
075: public Instance[] m_Instances;
076:
077: /** count **/
078: public int m_nCount;
079:
080: /** first node in VaryNode array **/
081: public int m_nStartNode;
082:
083: /** Creates new ADNode */
084: public ADNode() {
085: }
086:
087: /**
088: * Returns an instance of a TechnicalInformation object, containing
089: * detailed information about the technical background of this class,
090: * e.g., paper reference or book this class is based on.
091: *
092: * @return the technical information about this class
093: */
094: public TechnicalInformation getTechnicalInformation() {
095: TechnicalInformation result;
096:
097: result = new TechnicalInformation(Type.ARTICLE);
098: result
099: .setValue(Field.AUTHOR,
100: "Andrew W. Moore and Mary S. Lee");
101: result.setValue(Field.YEAR, "1998");
102: result
103: .setValue(
104: Field.TITLE,
105: "Cached Sufficient Statistics for Efficient Machine Learning with Large Datasets");
106: result.setValue(Field.JOURNAL,
107: "Journal of Artificial Intelligence Research");
108: result.setValue(Field.VOLUME, "8");
109: result.setValue(Field.PAGES, "67-91");
110:
111: return result;
112: }
113:
114: /** create sub tree
115: * @param iNode index of the lowest node in the tree
116: * @param nRecords set of records in instances to be considered
117: * @param instances data set
118: * @return VaryNode representing part of an ADTree
119: **/
120: public static VaryNode makeVaryNode(int iNode, FastVector nRecords,
121: Instances instances) {
122: VaryNode _VaryNode = new VaryNode(iNode);
123: int nValues = instances.attribute(iNode).numValues();
124:
125: // reserve memory and initialize
126: FastVector[] nChildRecords = new FastVector[nValues];
127: for (int iChild = 0; iChild < nValues; iChild++) {
128: nChildRecords[iChild] = new FastVector();
129: }
130: // divide the records among children
131: for (int iRecord = 0; iRecord < nRecords.size(); iRecord++) {
132: int iInstance = ((Integer) nRecords.elementAt(iRecord))
133: .intValue();
134: nChildRecords[(int) instances.instance(iInstance).value(
135: iNode)].addElement(new Integer(iInstance));
136: }
137:
138: // find most common value
139: int nCount = nChildRecords[0].size();
140: int nMCV = 0;
141: for (int iChild = 1; iChild < nValues; iChild++) {
142: if (nChildRecords[iChild].size() > nCount) {
143: nCount = nChildRecords[iChild].size();
144: nMCV = iChild;
145: }
146: }
147: _VaryNode.m_nMCV = nMCV;
148:
149: // determine child nodes
150: _VaryNode.m_ADNodes = new ADNode[nValues];
151: for (int iChild = 0; iChild < nValues; iChild++) {
152: if (iChild == nMCV || nChildRecords[iChild].size() == 0) {
153: _VaryNode.m_ADNodes[iChild] = null;
154: } else {
155: _VaryNode.m_ADNodes[iChild] = makeADTree(iNode + 1,
156: nChildRecords[iChild], instances);
157: }
158: }
159: return _VaryNode;
160: } // MakeVaryNode
161:
162: /**
163: * create sub tree
164: *
165: * @param iNode index of the lowest node in the tree
166: * @param nRecords set of records in instances to be considered
167: * @param instances data set
168: * @return ADNode representing an ADTree
169: */
170: public static ADNode makeADTree(int iNode, FastVector nRecords,
171: Instances instances) {
172: ADNode _ADNode = new ADNode();
173: _ADNode.m_nCount = nRecords.size();
174: _ADNode.m_nStartNode = iNode;
175: if (nRecords.size() < MIN_RECORD_SIZE) {
176: _ADNode.m_Instances = new Instance[nRecords.size()];
177: for (int iInstance = 0; iInstance < nRecords.size(); iInstance++) {
178: _ADNode.m_Instances[iInstance] = instances
179: .instance(((Integer) nRecords
180: .elementAt(iInstance)).intValue());
181: }
182: } else {
183: _ADNode.m_VaryNodes = new VaryNode[instances
184: .numAttributes()
185: - iNode];
186: for (int iNode2 = iNode; iNode2 < instances.numAttributes(); iNode2++) {
187: _ADNode.m_VaryNodes[iNode2 - iNode] = makeVaryNode(
188: iNode2, nRecords, instances);
189: }
190: }
191: return _ADNode;
192: } // MakeADTree
193:
194: /**
195: * create AD tree from set of instances
196: *
197: * @param instances data set
198: * @return ADNode representing an ADTree
199: */
200: public static ADNode makeADTree(Instances instances) {
201: FastVector nRecords = new FastVector(instances.numInstances());
202: for (int iRecord = 0; iRecord < instances.numInstances(); iRecord++) {
203: nRecords.addElement(new Integer(iRecord));
204: }
205: return makeADTree(0, nRecords, instances);
206: } // MakeADTree
207:
208: /**
209: * get counts for specific instantiation of a set of nodes
210: *
211: * @param nCounts - array for storing counts
212: * @param nNodes - array of node indexes
213: * @param nOffsets - offset for nodes in nNodes in nCounts
214: * @param iNode - index into nNode indicating current node
215: * @param iOffset - Offset into nCounts due to nodes below iNode
216: * @param bSubstract - indicate whether counts should be added or substracted
217: */
218: public void getCounts(int[] nCounts, int[] nNodes, int[] nOffsets,
219: int iNode, int iOffset, boolean bSubstract) {
220: //for (int iNode2 = 0; iNode2 < nCounts.length; iNode2++) {
221: // System.out.print(nCounts[iNode2] + " ");
222: //}
223: //System.out.println();
224: if (iNode >= nNodes.length) {
225: if (bSubstract) {
226: nCounts[iOffset] -= m_nCount;
227: } else {
228: nCounts[iOffset] += m_nCount;
229: }
230: return;
231: } else {
232: if (m_VaryNodes != null) {
233: m_VaryNodes[nNodes[iNode] - m_nStartNode].getCounts(
234: nCounts, nNodes, nOffsets, iNode, iOffset,
235: this , bSubstract);
236: } else {
237: for (int iInstance = 0; iInstance < m_Instances.length; iInstance++) {
238: int iOffset2 = iOffset;
239: Instance instance = m_Instances[iInstance];
240: for (int iNode2 = iNode; iNode2 < nNodes.length; iNode2++) {
241: iOffset2 = iOffset2 + nOffsets[iNode2]
242: * (int) instance.value(nNodes[iNode2]);
243: }
244: if (bSubstract) {
245: nCounts[iOffset2]--;
246: } else {
247: nCounts[iOffset2]++;
248: }
249: }
250: }
251: }
252: } // getCounts
253:
254: /**
255: * print is used for debugging only and shows the ADTree in ASCII graphics
256: */
257: public void print() {
258: String sTab = new String();
259: for (int i = 0; i < m_nStartNode; i++) {
260: sTab = sTab + " ";
261: }
262: System.out.println(sTab + "Count = " + m_nCount);
263: if (m_VaryNodes != null) {
264: for (int iNode = 0; iNode < m_VaryNodes.length; iNode++) {
265: System.out.println(sTab + "Node "
266: + (iNode + m_nStartNode));
267: m_VaryNodes[iNode].print(sTab);
268: }
269: } else {
270: System.out.println(m_Instances);
271: }
272: }
273:
274: /**
275: * for testing only
276: *
277: * @param argv the commandline options
278: */
279: public static void main(String[] argv) {
280: try {
281: Instances instances = new Instances(new FileReader(
282: "\\iris.2.arff"));
283: ADNode ADTree = ADNode.makeADTree(instances);
284: int[] nCounts = new int[12];
285: int[] nNodes = new int[3];
286: int[] nOffsets = new int[3];
287: nNodes[0] = 0;
288: nNodes[1] = 3;
289: nNodes[2] = 4;
290: nOffsets[0] = 2;
291: nOffsets[1] = 1;
292: nOffsets[2] = 4;
293: ADTree.print();
294: ADTree.getCounts(nCounts, nNodes, nOffsets, 0, 0, false);
295:
296: } catch (Throwable t) {
297: t.printStackTrace();
298: }
299: } // main
300:
301: } // class ADNode
|