0001: /*
0002: * This program is free software; you can redistribute it and/or modify
0003: * it under the terms of the GNU General Public License as published by
0004: * the Free Software Foundation; either version 2 of the License, or
0005: * (at your option) any later version.
0006: *
0007: * This program is distributed in the hope that it will be useful,
0008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
0009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
0010: * GNU General Public License for more details.
0011: *
0012: * You should have received a copy of the GNU General Public License
0013: * along with this program; if not, write to the Free Software
0014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
0015: */
0016:
0017: /*
0018: * CheckClassifier.java
0019: * Copyright (C) 1999 University of Waikato, Hamilton, New Zealand
0020: *
0021: */
0022:
0023: package weka.classifiers;
0024:
0025: import weka.core.Attribute;
0026: import weka.core.CheckScheme;
0027: import weka.core.FastVector;
0028: import weka.core.Instance;
0029: import weka.core.Instances;
0030: import weka.core.MultiInstanceCapabilitiesHandler;
0031: import weka.core.Option;
0032: import weka.core.OptionHandler;
0033: import weka.core.SerializationHelper;
0034: import weka.core.TestInstances;
0035: import weka.core.Utils;
0036: import weka.core.WeightedInstancesHandler;
0037:
0038: import java.util.Enumeration;
0039: import java.util.Random;
0040: import java.util.Vector;
0041:
0042: /**
0043: * Class for examining the capabilities and finding problems with
0044: * classifiers. If you implement a classifier using the WEKA libraries,
0045: * you should run the checks on it to ensure robustness and correct
0046: * operation. Passing all the tests of this object does not mean
0047: * bugs in the classifier don't exist, but this will help find some
0048: * common ones. <p/>
0049: *
0050: * Typical usage: <p/>
0051: * <code>java weka.classifiers.CheckClassifier -W classifier_name
0052: * classifier_options </code><p/>
0053: *
0054: * CheckClassifier reports on the following:
0055: * <ul>
0056: * <li> Classifier abilities
0057: * <ul>
0058: * <li> Possible command line options to the classifier </li>
0059: * <li> Whether the classifier can predict nominal, numeric, string,
0060: * date or relational class attributes. Warnings will be displayed if
0061: * performance is worse than ZeroR </li>
0062: * <li> Whether the classifier can be trained incrementally </li>
0063: * <li> Whether the classifier can handle numeric predictor attributes </li>
0064: * <li> Whether the classifier can handle nominal predictor attributes </li>
0065: * <li> Whether the classifier can handle string predictor attributes </li>
0066: * <li> Whether the classifier can handle date predictor attributes </li>
0067: * <li> Whether the classifier can handle relational predictor attributes </li>
0068: * <li> Whether the classifier can handle multi-instance data </li>
0069: * <li> Whether the classifier can handle missing predictor values </li>
0070: * <li> Whether the classifier can handle missing class values </li>
0071: * <li> Whether a nominal classifier only handles 2 class problems </li>
0072: * <li> Whether the classifier can handle instance weights </li>
0073: * </ul>
0074: * </li>
0075: * <li> Correct functioning
0076: * <ul>
0077: * <li> Correct initialisation during buildClassifier (i.e. no result
0078: * changes when buildClassifier called repeatedly) </li>
0079: * <li> Whether incremental training produces the same results
0080: * as during non-incremental training (which may or may not
0081: * be OK) </li>
0082: * <li> Whether the classifier alters the data passed to it
0083: * (number of instances, instance order, instance weights, etc) </li>
0084: * <li> Whether the toString() method works correctly before the
0085: * classifier has been built. </li>
0086: * </ul>
0087: * </li>
0088: * <li> Degenerate cases
0089: * <ul>
0090: * <li> building classifier with zero training instances </li>
0091: * <li> all but one predictor attribute values missing </li>
0092: * <li> all predictor attribute values missing </li>
0093: * <li> all but one class values missing </li>
0094: * <li> all class values missing </li>
0095: * </ul>
0096: * </li>
0097: * </ul>
0098: * Running CheckClassifier with the debug option set will output the
0099: * training and test datasets for any failed tests.<p/>
0100: *
0101: * The <code>weka.classifiers.AbstractClassifierTest</code> uses this
0102: * class to test all the classifiers. Any changes here have to be
0103: * checked in that abstract test class, too. <p/>
0104: *
0105: <!-- options-start -->
0106: * Valid options are: <p/>
0107: *
0108: * <pre> -D
0109: * Turn on debugging output.</pre>
0110: *
0111: * <pre> -S
0112: * Silent mode - prints nothing to stdout.</pre>
0113: *
0114: * <pre> -N <num>
0115: * The number of instances in the datasets (default 20).</pre>
0116: *
0117: * <pre> -nominal <num>
0118: * The number of nominal attributes (default 2).</pre>
0119: *
0120: * <pre> -nominal-values <num>
0121: * The number of values for nominal attributes (default 1).</pre>
0122: *
0123: * <pre> -numeric <num>
0124: * The number of numeric attributes (default 1).</pre>
0125: *
0126: * <pre> -string <num>
0127: * The number of string attributes (default 1).</pre>
0128: *
0129: * <pre> -date <num>
0130: * The number of date attributes (default 1).</pre>
0131: *
0132: * <pre> -relational <num>
0133: * The number of relational attributes (default 1).</pre>
0134: *
0135: * <pre> -num-instances-relational <num>
0136: * The number of instances in relational/bag attributes (default 10).</pre>
0137: *
0138: * <pre> -words <comma-separated-list>
0139: * The words to use in string attributes.</pre>
0140: *
0141: * <pre> -word-separators <chars>
0142: * The word separators to use in string attributes.</pre>
0143: *
0144: * <pre> -W
0145: * Full name of the classifier analysed.
0146: * eg: weka.classifiers.bayes.NaiveBayes
0147: * (default weka.classifiers.rules.ZeroR)</pre>
0148: *
0149: * <pre>
0150: * Options specific to classifier weka.classifiers.rules.ZeroR:
0151: * </pre>
0152: *
0153: * <pre> -D
0154: * If set, classifier is run in debug mode and
0155: * may output additional info to the console</pre>
0156: *
0157: <!-- options-end -->
0158: *
0159: * Options after -- are passed to the designated classifier.<p/>
0160: *
0161: * @author Len Trigg (trigg@cs.waikato.ac.nz)
0162: * @author FracPete (fracpete at waikato dot ac dot nz)
0163: * @version $Revision: 1.32 $
0164: * @see TestInstances
0165: */
0166: public class CheckClassifier extends CheckScheme {
0167:
0168: /*
0169: * Note about test methods:
0170: * - methods return array of booleans
0171: * - first index: success or not
0172: * - second index: acceptable or not (e.g., Exception is OK)
0173: * - in case the performance is worse than that of ZeroR both indices are true
0174: *
0175: * FracPete (fracpete at waikato dot ac dot nz)
0176: */
0177:
0178: /** The classifier to be examined */
0179: protected Classifier m_Classifier = new weka.classifiers.rules.ZeroR();
0180:
0181: /**
0182: * Returns an enumeration describing the available options.
0183: *
0184: * @return an enumeration of all the available options.
0185: */
0186: public Enumeration listOptions() {
0187: Vector result = new Vector();
0188:
0189: Enumeration en = super .listOptions();
0190: while (en.hasMoreElements())
0191: result.addElement(en.nextElement());
0192:
0193: result.addElement(new Option(
0194: "\tFull name of the classifier analysed.\n"
0195: + "\teg: weka.classifiers.bayes.NaiveBayes\n"
0196: + "\t(default weka.classifiers.rules.ZeroR)",
0197: "W", 1, "-W"));
0198:
0199: if ((m_Classifier != null)
0200: && (m_Classifier instanceof OptionHandler)) {
0201: result.addElement(new Option("", "", 0,
0202: "\nOptions specific to classifier "
0203: + m_Classifier.getClass().getName() + ":"));
0204: Enumeration enu = ((OptionHandler) m_Classifier)
0205: .listOptions();
0206: while (enu.hasMoreElements())
0207: result.addElement(enu.nextElement());
0208: }
0209:
0210: return result.elements();
0211: }
0212:
0213: /**
0214: * Parses a given list of options.
0215: *
0216: <!-- options-start -->
0217: * Valid options are: <p/>
0218: *
0219: * <pre> -D
0220: * Turn on debugging output.</pre>
0221: *
0222: * <pre> -S
0223: * Silent mode - prints nothing to stdout.</pre>
0224: *
0225: * <pre> -N <num>
0226: * The number of instances in the datasets (default 20).</pre>
0227: *
0228: * <pre> -nominal <num>
0229: * The number of nominal attributes (default 2).</pre>
0230: *
0231: * <pre> -nominal-values <num>
0232: * The number of values for nominal attributes (default 1).</pre>
0233: *
0234: * <pre> -numeric <num>
0235: * The number of numeric attributes (default 1).</pre>
0236: *
0237: * <pre> -string <num>
0238: * The number of string attributes (default 1).</pre>
0239: *
0240: * <pre> -date <num>
0241: * The number of date attributes (default 1).</pre>
0242: *
0243: * <pre> -relational <num>
0244: * The number of relational attributes (default 1).</pre>
0245: *
0246: * <pre> -num-instances-relational <num>
0247: * The number of instances in relational/bag attributes (default 10).</pre>
0248: *
0249: * <pre> -words <comma-separated-list>
0250: * The words to use in string attributes.</pre>
0251: *
0252: * <pre> -word-separators <chars>
0253: * The word separators to use in string attributes.</pre>
0254: *
0255: * <pre> -W
0256: * Full name of the classifier analysed.
0257: * eg: weka.classifiers.bayes.NaiveBayes
0258: * (default weka.classifiers.rules.ZeroR)</pre>
0259: *
0260: * <pre>
0261: * Options specific to classifier weka.classifiers.rules.ZeroR:
0262: * </pre>
0263: *
0264: * <pre> -D
0265: * If set, classifier is run in debug mode and
0266: * may output additional info to the console</pre>
0267: *
0268: <!-- options-end -->
0269: *
0270: * @param options the list of options as an array of strings
0271: * @throws Exception if an option is not supported
0272: */
0273: public void setOptions(String[] options) throws Exception {
0274: String tmpStr;
0275:
0276: super .setOptions(options);
0277:
0278: tmpStr = Utils.getOption('W', options);
0279: if (tmpStr.length() == 0)
0280: tmpStr = weka.classifiers.rules.ZeroR.class.getName();
0281: setClassifier((Classifier) forName("weka.classifiers",
0282: Classifier.class, tmpStr, Utils
0283: .partitionOptions(options)));
0284: }
0285:
0286: /**
0287: * Gets the current settings of the CheckClassifier.
0288: *
0289: * @return an array of strings suitable for passing to setOptions
0290: */
0291: public String[] getOptions() {
0292: Vector result;
0293: String[] options;
0294: int i;
0295:
0296: result = new Vector();
0297:
0298: options = super .getOptions();
0299: for (i = 0; i < options.length; i++)
0300: result.add(options[i]);
0301:
0302: if (getClassifier() != null) {
0303: result.add("-W");
0304: result.add(getClassifier().getClass().getName());
0305: }
0306:
0307: if ((m_Classifier != null)
0308: && (m_Classifier instanceof OptionHandler))
0309: options = ((OptionHandler) m_Classifier).getOptions();
0310: else
0311: options = new String[0];
0312:
0313: if (options.length > 0) {
0314: result.add("--");
0315: for (i = 0; i < options.length; i++)
0316: result.add(options[i]);
0317: }
0318:
0319: return (String[]) result.toArray(new String[result.size()]);
0320: }
0321:
0322: /**
0323: * Begin the tests, reporting results to System.out
0324: */
0325: public void doTests() {
0326:
0327: if (getClassifier() == null) {
0328: println("\n=== No classifier set ===");
0329: return;
0330: }
0331: println("\n=== Check on Classifier: "
0332: + getClassifier().getClass().getName() + " ===\n");
0333:
0334: // Start tests
0335: m_ClasspathProblems = false;
0336: println("--> Checking for interfaces");
0337: canTakeOptions();
0338: boolean updateableClassifier = updateableClassifier()[0];
0339: boolean weightedInstancesHandler = weightedInstancesHandler()[0];
0340: boolean multiInstanceHandler = multiInstanceHandler()[0];
0341: println("--> Classifier tests");
0342: declaresSerialVersionUID();
0343: testToString();
0344: testsPerClassType(Attribute.NOMINAL, updateableClassifier,
0345: weightedInstancesHandler, multiInstanceHandler);
0346: testsPerClassType(Attribute.NUMERIC, updateableClassifier,
0347: weightedInstancesHandler, multiInstanceHandler);
0348: testsPerClassType(Attribute.DATE, updateableClassifier,
0349: weightedInstancesHandler, multiInstanceHandler);
0350: testsPerClassType(Attribute.STRING, updateableClassifier,
0351: weightedInstancesHandler, multiInstanceHandler);
0352: testsPerClassType(Attribute.RELATIONAL, updateableClassifier,
0353: weightedInstancesHandler, multiInstanceHandler);
0354: }
0355:
0356: /**
0357: * Set the classifier for boosting.
0358: *
0359: * @param newClassifier the Classifier to use.
0360: */
0361: public void setClassifier(Classifier newClassifier) {
0362: m_Classifier = newClassifier;
0363: }
0364:
0365: /**
0366: * Get the classifier used as the classifier
0367: *
0368: * @return the classifier used as the classifier
0369: */
0370: public Classifier getClassifier() {
0371: return m_Classifier;
0372: }
0373:
0374: /**
0375: * Run a battery of tests for a given class attribute type
0376: *
0377: * @param classType true if the class attribute should be numeric
0378: * @param updateable true if the classifier is updateable
0379: * @param weighted true if the classifier says it handles weights
0380: * @param multiInstance true if the classifier is a multi-instance classifier
0381: */
0382: protected void testsPerClassType(int classType, boolean updateable,
0383: boolean weighted, boolean multiInstance) {
0384:
0385: boolean PNom = canPredict(true, false, false, false, false,
0386: multiInstance, classType)[0];
0387: boolean PNum = canPredict(false, true, false, false, false,
0388: multiInstance, classType)[0];
0389: boolean PStr = canPredict(false, false, true, false, false,
0390: multiInstance, classType)[0];
0391: boolean PDat = canPredict(false, false, false, true, false,
0392: multiInstance, classType)[0];
0393: boolean PRel;
0394: if (!multiInstance)
0395: PRel = canPredict(false, false, false, false, true,
0396: multiInstance, classType)[0];
0397: else
0398: PRel = false;
0399:
0400: if (PNom || PNum || PStr || PDat || PRel) {
0401: if (weighted)
0402: instanceWeights(PNom, PNum, PStr, PDat, PRel,
0403: multiInstance, classType);
0404:
0405: canHandleOnlyClass(PNom, PNum, PStr, PDat, PRel, classType);
0406:
0407: if (classType == Attribute.NOMINAL)
0408: canHandleNClasses(PNom, PNum, PStr, PDat, PRel,
0409: multiInstance, 4);
0410:
0411: if (!multiInstance) {
0412: canHandleClassAsNthAttribute(PNom, PNum, PStr, PDat,
0413: PRel, multiInstance, classType, 0);
0414: canHandleClassAsNthAttribute(PNom, PNum, PStr, PDat,
0415: PRel, multiInstance, classType, 1);
0416: }
0417:
0418: canHandleZeroTraining(PNom, PNum, PStr, PDat, PRel,
0419: multiInstance, classType);
0420: boolean handleMissingPredictors = canHandleMissing(PNom,
0421: PNum, PStr, PDat, PRel, multiInstance, classType,
0422: true, false, 20)[0];
0423: if (handleMissingPredictors)
0424: canHandleMissing(PNom, PNum, PStr, PDat, PRel,
0425: multiInstance, classType, true, false, 100);
0426:
0427: boolean handleMissingClass = canHandleMissing(PNom, PNum,
0428: PStr, PDat, PRel, multiInstance, classType, false,
0429: true, 20)[0];
0430: if (handleMissingClass)
0431: canHandleMissing(PNom, PNum, PStr, PDat, PRel,
0432: multiInstance, classType, false, true, 100);
0433:
0434: correctBuildInitialisation(PNom, PNum, PStr, PDat, PRel,
0435: multiInstance, classType);
0436: datasetIntegrity(PNom, PNum, PStr, PDat, PRel,
0437: multiInstance, classType, handleMissingPredictors,
0438: handleMissingClass);
0439: doesntUseTestClassVal(PNom, PNum, PStr, PDat, PRel,
0440: multiInstance, classType);
0441: if (updateable)
0442: updatingEquality(PNom, PNum, PStr, PDat, PRel,
0443: multiInstance, classType);
0444: }
0445: }
0446:
0447: /**
0448: * Checks whether the scheme's toString() method works even though the
0449: * classifies hasn't been built yet.
0450: *
0451: * @return index 0 is true if the toString() method works fine
0452: */
0453: protected boolean[] testToString() {
0454: boolean[] result = new boolean[2];
0455:
0456: print("toString...");
0457:
0458: try {
0459: Classifier copy = (Classifier) m_Classifier.getClass()
0460: .newInstance();
0461: copy.toString();
0462: result[0] = true;
0463: println("yes");
0464: } catch (Exception e) {
0465: result[0] = false;
0466: println("no");
0467: if (m_Debug) {
0468: println("\n=== Full report ===");
0469: e.printStackTrace();
0470: println("\n");
0471: }
0472: }
0473:
0474: return result;
0475: }
0476:
0477: /**
0478: * tests for a serialVersionUID. Fails in case the scheme doesn't declare
0479: * a UID.
0480: *
0481: * @return index 0 is true if the scheme declares a UID
0482: */
0483: protected boolean[] declaresSerialVersionUID() {
0484: boolean[] result = new boolean[2];
0485:
0486: print("serialVersionUID...");
0487:
0488: result[0] = !SerializationHelper.needsUID(m_Classifier
0489: .getClass());
0490:
0491: if (result[0])
0492: println("yes");
0493: else
0494: println("no");
0495:
0496: return result;
0497: }
0498:
0499: /**
0500: * Checks whether the scheme can take command line options.
0501: *
0502: * @return index 0 is true if the classifier can take options
0503: */
0504: protected boolean[] canTakeOptions() {
0505:
0506: boolean[] result = new boolean[2];
0507:
0508: print("options...");
0509: if (m_Classifier instanceof OptionHandler) {
0510: println("yes");
0511: if (m_Debug) {
0512: println("\n=== Full report ===");
0513: Enumeration enu = ((OptionHandler) m_Classifier)
0514: .listOptions();
0515: while (enu.hasMoreElements()) {
0516: Option option = (Option) enu.nextElement();
0517: print(option.synopsis() + "\n"
0518: + option.description() + "\n");
0519: }
0520: println("\n");
0521: }
0522: result[0] = true;
0523: } else {
0524: println("no");
0525: result[0] = false;
0526: }
0527:
0528: return result;
0529: }
0530:
0531: /**
0532: * Checks whether the scheme can build models incrementally.
0533: *
0534: * @return index 0 is true if the classifier can train incrementally
0535: */
0536: protected boolean[] updateableClassifier() {
0537:
0538: boolean[] result = new boolean[2];
0539:
0540: print("updateable classifier...");
0541: if (m_Classifier instanceof UpdateableClassifier) {
0542: println("yes");
0543: result[0] = true;
0544: } else {
0545: println("no");
0546: result[0] = false;
0547: }
0548:
0549: return result;
0550: }
0551:
0552: /**
0553: * Checks whether the scheme says it can handle instance weights.
0554: *
0555: * @return true if the classifier handles instance weights
0556: */
0557: protected boolean[] weightedInstancesHandler() {
0558:
0559: boolean[] result = new boolean[2];
0560:
0561: print("weighted instances classifier...");
0562: if (m_Classifier instanceof WeightedInstancesHandler) {
0563: println("yes");
0564: result[0] = true;
0565: } else {
0566: println("no");
0567: result[0] = false;
0568: }
0569:
0570: return result;
0571: }
0572:
0573: /**
0574: * Checks whether the scheme handles multi-instance data.
0575: *
0576: * @return true if the classifier handles multi-instance data
0577: */
0578: protected boolean[] multiInstanceHandler() {
0579: boolean[] result = new boolean[2];
0580:
0581: print("multi-instance classifier...");
0582: if (m_Classifier instanceof MultiInstanceCapabilitiesHandler) {
0583: println("yes");
0584: result[0] = true;
0585: } else {
0586: println("no");
0587: result[0] = false;
0588: }
0589:
0590: return result;
0591: }
0592:
0593: /**
0594: * Checks basic prediction of the scheme, for simple non-troublesome
0595: * datasets.
0596: *
0597: * @param nominalPredictor if true use nominal predictor attributes
0598: * @param numericPredictor if true use numeric predictor attributes
0599: * @param stringPredictor if true use string predictor attributes
0600: * @param datePredictor if true use date predictor attributes
0601: * @param relationalPredictor if true use relational predictor attributes
0602: * @param multiInstance whether multi-instance is needed
0603: * @param classType the class type (NOMINAL, NUMERIC, etc.)
0604: * @return index 0 is true if the test was passed, index 1 is true if test
0605: * was acceptable
0606: */
0607: protected boolean[] canPredict(boolean nominalPredictor,
0608: boolean numericPredictor, boolean stringPredictor,
0609: boolean datePredictor, boolean relationalPredictor,
0610: boolean multiInstance, int classType) {
0611:
0612: print("basic predict");
0613: printAttributeSummary(nominalPredictor, numericPredictor,
0614: stringPredictor, datePredictor, relationalPredictor,
0615: multiInstance, classType);
0616: print("...");
0617: FastVector accepts = new FastVector();
0618: accepts.addElement("unary");
0619: accepts.addElement("binary");
0620: accepts.addElement("nominal");
0621: accepts.addElement("numeric");
0622: accepts.addElement("string");
0623: accepts.addElement("date");
0624: accepts.addElement("relational");
0625: accepts.addElement("multi-instance");
0626: accepts.addElement("not in classpath");
0627: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
0628: boolean predictorMissing = false, classMissing = false;
0629:
0630: return runBasicTest(nominalPredictor, numericPredictor,
0631: stringPredictor, datePredictor, relationalPredictor,
0632: multiInstance, classType, missingLevel,
0633: predictorMissing, classMissing, numTrain, numTest,
0634: numClasses, accepts);
0635: }
0636:
0637: /**
0638: * Checks whether the scheme can handle data that contains only the class
0639: * attribute. If a scheme cannot build a proper model with that data, it
0640: * should default back to a ZeroR model.
0641: *
0642: * @param nominalPredictor if true use nominal predictor attributes
0643: * @param numericPredictor if true use numeric predictor attributes
0644: * @param stringPredictor if true use string predictor attributes
0645: * @param datePredictor if true use date predictor attributes
0646: * @param relationalPredictor if true use relational predictor attributes
0647: * @param classType the class type (NOMINAL, NUMERIC, etc.)
0648: * @return index 0 is true if the test was passed, index 1 is true if test
0649: * was acceptable
0650: */
0651: protected boolean[] canHandleOnlyClass(boolean nominalPredictor,
0652: boolean numericPredictor, boolean stringPredictor,
0653: boolean datePredictor, boolean relationalPredictor,
0654: int classType) {
0655:
0656: print("only class in data");
0657: printAttributeSummary(nominalPredictor, numericPredictor,
0658: stringPredictor, datePredictor, relationalPredictor,
0659: false, classType);
0660: print("...");
0661: FastVector accepts = new FastVector();
0662: accepts.addElement("class");
0663: accepts.addElement("zeror");
0664: int numTrain = getNumInstances(), numTest = getNumInstances(), missingLevel = 0;
0665: boolean predictorMissing = false, classMissing = false;
0666:
0667: return runBasicTest(false, false, false, false, false, false,
0668: classType, missingLevel, predictorMissing,
0669: classMissing, numTrain, numTest, 2, accepts);
0670: }
0671:
0672: /**
0673: * Checks whether nominal schemes can handle more than two classes.
0674: * If a scheme is only designed for two-class problems it should
0675: * throw an appropriate exception for multi-class problems.
0676: *
0677: * @param nominalPredictor if true use nominal predictor attributes
0678: * @param numericPredictor if true use numeric predictor attributes
0679: * @param stringPredictor if true use string predictor attributes
0680: * @param datePredictor if true use date predictor attributes
0681: * @param relationalPredictor if true use relational predictor attributes
0682: * @param multiInstance whether multi-instance is needed
0683: * @param numClasses the number of classes to test
0684: * @return index 0 is true if the test was passed, index 1 is true if test
0685: * was acceptable
0686: */
0687: protected boolean[] canHandleNClasses(boolean nominalPredictor,
0688: boolean numericPredictor, boolean stringPredictor,
0689: boolean datePredictor, boolean relationalPredictor,
0690: boolean multiInstance, int numClasses) {
0691:
0692: print("more than two class problems");
0693: printAttributeSummary(nominalPredictor, numericPredictor,
0694: stringPredictor, datePredictor, relationalPredictor,
0695: multiInstance, Attribute.NOMINAL);
0696: print("...");
0697: FastVector accepts = new FastVector();
0698: accepts.addElement("number");
0699: accepts.addElement("class");
0700: int numTrain = getNumInstances(), numTest = getNumInstances(), missingLevel = 0;
0701: boolean predictorMissing = false, classMissing = false;
0702:
0703: return runBasicTest(nominalPredictor, numericPredictor,
0704: stringPredictor, datePredictor, relationalPredictor,
0705: multiInstance, Attribute.NOMINAL, missingLevel,
0706: predictorMissing, classMissing, numTrain, numTest,
0707: numClasses, accepts);
0708: }
0709:
0710: /**
0711: * Checks whether the scheme can handle class attributes as Nth attribute.
0712: *
0713: * @param nominalPredictor if true use nominal predictor attributes
0714: * @param numericPredictor if true use numeric predictor attributes
0715: * @param stringPredictor if true use string predictor attributes
0716: * @param datePredictor if true use date predictor attributes
0717: * @param relationalPredictor if true use relational predictor attributes
0718: * @param multiInstance whether multi-instance is needed
0719: * @param classType the class type (NUMERIC, NOMINAL, etc.)
0720: * @param classIndex the index of the class attribute (0-based, -1 means last attribute)
0721: * @return index 0 is true if the test was passed, index 1 is true if test
0722: * was acceptable
0723: * @see TestInstances#CLASS_IS_LAST
0724: */
0725: protected boolean[] canHandleClassAsNthAttribute(
0726: boolean nominalPredictor, boolean numericPredictor,
0727: boolean stringPredictor, boolean datePredictor,
0728: boolean relationalPredictor, boolean multiInstance,
0729: int classType, int classIndex) {
0730:
0731: if (classIndex == TestInstances.CLASS_IS_LAST)
0732: print("class attribute as last attribute");
0733: else
0734: print("class attribute as " + (classIndex + 1)
0735: + ". attribute");
0736: printAttributeSummary(nominalPredictor, numericPredictor,
0737: stringPredictor, datePredictor, relationalPredictor,
0738: multiInstance, classType);
0739: print("...");
0740: FastVector accepts = new FastVector();
0741: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
0742: boolean predictorMissing = false, classMissing = false;
0743:
0744: return runBasicTest(nominalPredictor, numericPredictor,
0745: stringPredictor, datePredictor, relationalPredictor,
0746: multiInstance, classType, classIndex, missingLevel,
0747: predictorMissing, classMissing, numTrain, numTest,
0748: numClasses, accepts);
0749: }
0750:
0751: /**
0752: * Checks whether the scheme can handle zero training instances.
0753: *
0754: * @param nominalPredictor if true use nominal predictor attributes
0755: * @param numericPredictor if true use numeric predictor attributes
0756: * @param stringPredictor if true use string predictor attributes
0757: * @param datePredictor if true use date predictor attributes
0758: * @param relationalPredictor if true use relational predictor attributes
0759: * @param multiInstance whether multi-instance is needed
0760: * @param classType the class type (NUMERIC, NOMINAL, etc.)
0761: * @return index 0 is true if the test was passed, index 1 is true if test
0762: * was acceptable
0763: */
0764: protected boolean[] canHandleZeroTraining(boolean nominalPredictor,
0765: boolean numericPredictor, boolean stringPredictor,
0766: boolean datePredictor, boolean relationalPredictor,
0767: boolean multiInstance, int classType) {
0768:
0769: print("handle zero training instances");
0770: printAttributeSummary(nominalPredictor, numericPredictor,
0771: stringPredictor, datePredictor, relationalPredictor,
0772: multiInstance, classType);
0773: print("...");
0774: FastVector accepts = new FastVector();
0775: accepts.addElement("train");
0776: accepts.addElement("value");
0777: int numTrain = 0, numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
0778: boolean predictorMissing = false, classMissing = false;
0779:
0780: return runBasicTest(nominalPredictor, numericPredictor,
0781: stringPredictor, datePredictor, relationalPredictor,
0782: multiInstance, classType, missingLevel,
0783: predictorMissing, classMissing, numTrain, numTest,
0784: numClasses, accepts);
0785: }
0786:
0787: /**
0788: * Checks whether the scheme correctly initialises models when
0789: * buildClassifier is called. This test calls buildClassifier with
0790: * one training dataset and records performance on a test set.
0791: * buildClassifier is then called on a training set with different
0792: * structure, and then again with the original training set. The
0793: * performance on the test set is compared with the original results
0794: * and any performance difference noted as incorrect build initialisation.
0795: *
0796: * @param nominalPredictor if true use nominal predictor attributes
0797: * @param numericPredictor if true use numeric predictor attributes
0798: * @param stringPredictor if true use string predictor attributes
0799: * @param datePredictor if true use date predictor attributes
0800: * @param relationalPredictor if true use relational predictor attributes
0801: * @param multiInstance whether multi-instance is needed
0802: * @param classType the class type (NUMERIC, NOMINAL, etc.)
0803: * @return index 0 is true if the test was passed, index 1 is true if the
0804: * scheme performs worse than ZeroR, but without error (index 0 is
0805: * false)
0806: */
0807: protected boolean[] correctBuildInitialisation(
0808: boolean nominalPredictor, boolean numericPredictor,
0809: boolean stringPredictor, boolean datePredictor,
0810: boolean relationalPredictor, boolean multiInstance,
0811: int classType) {
0812:
0813: boolean[] result = new boolean[2];
0814:
0815: print("correct initialisation during buildClassifier");
0816: printAttributeSummary(nominalPredictor, numericPredictor,
0817: stringPredictor, datePredictor, relationalPredictor,
0818: multiInstance, classType);
0819: print("...");
0820: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
0821: boolean predictorMissing = false, classMissing = false;
0822:
0823: Instances train1 = null;
0824: Instances test1 = null;
0825: Instances train2 = null;
0826: Instances test2 = null;
0827: Classifier classifier = null;
0828: Evaluation evaluation1A = null;
0829: Evaluation evaluation1B = null;
0830: Evaluation evaluation2 = null;
0831: boolean built = false;
0832: int stage = 0;
0833: try {
0834:
0835: // Make two sets of train/test splits with different
0836: // numbers of attributes
0837: train1 = makeTestDataset(42, numTrain,
0838: nominalPredictor ? getNumNominal() : 0,
0839: numericPredictor ? getNumNumeric() : 0,
0840: stringPredictor ? getNumString() : 0,
0841: datePredictor ? getNumDate() : 0,
0842: relationalPredictor ? getNumRelational() : 0,
0843: numClasses, classType, multiInstance);
0844: train2 = makeTestDataset(84, numTrain,
0845: nominalPredictor ? getNumNominal() + 1 : 0,
0846: numericPredictor ? getNumNumeric() + 1 : 0,
0847: stringPredictor ? getNumString() : 0,
0848: datePredictor ? getNumDate() : 0,
0849: relationalPredictor ? getNumRelational() : 0,
0850: numClasses, classType, multiInstance);
0851: test1 = makeTestDataset(24, numTest,
0852: nominalPredictor ? getNumNominal() : 0,
0853: numericPredictor ? getNumNumeric() : 0,
0854: stringPredictor ? getNumString() : 0,
0855: datePredictor ? getNumDate() : 0,
0856: relationalPredictor ? getNumRelational() : 0,
0857: numClasses, classType, multiInstance);
0858: test2 = makeTestDataset(48, numTest,
0859: nominalPredictor ? getNumNominal() + 1 : 0,
0860: numericPredictor ? getNumNumeric() + 1 : 0,
0861: stringPredictor ? getNumString() : 0,
0862: datePredictor ? getNumDate() : 0,
0863: relationalPredictor ? getNumRelational() : 0,
0864: numClasses, classType, multiInstance);
0865: if (missingLevel > 0) {
0866: addMissing(train1, missingLevel, predictorMissing,
0867: classMissing);
0868: addMissing(test1, Math.min(missingLevel, 50),
0869: predictorMissing, classMissing);
0870: addMissing(train2, missingLevel, predictorMissing,
0871: classMissing);
0872: addMissing(test2, Math.min(missingLevel, 50),
0873: predictorMissing, classMissing);
0874: }
0875:
0876: classifier = Classifier.makeCopies(getClassifier(), 1)[0];
0877: evaluation1A = new Evaluation(train1);
0878: evaluation1B = new Evaluation(train1);
0879: evaluation2 = new Evaluation(train2);
0880: } catch (Exception ex) {
0881: throw new Error("Error setting up for tests: "
0882: + ex.getMessage());
0883: }
0884: try {
0885: stage = 0;
0886: classifier.buildClassifier(train1);
0887: built = true;
0888: if (!testWRTZeroR(classifier, evaluation1A, train1, test1)[0]) {
0889: throw new Exception("Scheme performs worse than ZeroR");
0890: }
0891:
0892: stage = 1;
0893: built = false;
0894: classifier.buildClassifier(train2);
0895: built = true;
0896: if (!testWRTZeroR(classifier, evaluation2, train2, test2)[0]) {
0897: throw new Exception("Scheme performs worse than ZeroR");
0898: }
0899:
0900: stage = 2;
0901: built = false;
0902: classifier.buildClassifier(train1);
0903: built = true;
0904: if (!testWRTZeroR(classifier, evaluation1B, train1, test1)[0]) {
0905: throw new Exception("Scheme performs worse than ZeroR");
0906: }
0907:
0908: stage = 3;
0909: if (!evaluation1A.equals(evaluation1B)) {
0910: if (m_Debug) {
0911: println("\n=== Full report ===\n"
0912: + evaluation1A.toSummaryString(
0913: "\nFirst buildClassifier()", true)
0914: + "\n\n");
0915: println(evaluation1B.toSummaryString(
0916: "\nSecond buildClassifier()", true)
0917: + "\n\n");
0918: }
0919: throw new Exception(
0920: "Results differ between buildClassifier calls");
0921: }
0922: println("yes");
0923: result[0] = true;
0924:
0925: if (false && m_Debug) {
0926: println("\n=== Full report ===\n"
0927: + evaluation1A.toSummaryString(
0928: "\nFirst buildClassifier()", true)
0929: + "\n\n");
0930: println(evaluation1B.toSummaryString(
0931: "\nSecond buildClassifier()", true)
0932: + "\n\n");
0933: }
0934: } catch (Exception ex) {
0935: String msg = ex.getMessage().toLowerCase();
0936: if (msg.indexOf("worse than zeror") >= 0) {
0937: println("warning: performs worse than ZeroR");
0938: result[0] = (stage < 1);
0939: result[1] = (stage < 1);
0940: } else {
0941: println("no");
0942: result[0] = false;
0943: }
0944: if (m_Debug) {
0945: println("\n=== Full Report ===");
0946: print("Problem during");
0947: if (built) {
0948: print(" testing");
0949: } else {
0950: print(" training");
0951: }
0952: switch (stage) {
0953: case 0:
0954: print(" of dataset 1");
0955: break;
0956: case 1:
0957: print(" of dataset 2");
0958: break;
0959: case 2:
0960: print(" of dataset 1 (2nd build)");
0961: break;
0962: case 3:
0963: print(", comparing results from builds of dataset 1");
0964: break;
0965: }
0966: println(": " + ex.getMessage() + "\n");
0967: println("here are the datasets:\n");
0968: println("=== Train1 Dataset ===\n" + train1.toString()
0969: + "\n");
0970: println("=== Test1 Dataset ===\n" + test1.toString()
0971: + "\n\n");
0972: println("=== Train2 Dataset ===\n" + train2.toString()
0973: + "\n");
0974: println("=== Test2 Dataset ===\n" + test2.toString()
0975: + "\n\n");
0976: }
0977: }
0978:
0979: return result;
0980: }
0981:
0982: /**
0983: * Checks basic missing value handling of the scheme. If the missing
0984: * values cause an exception to be thrown by the scheme, this will be
0985: * recorded.
0986: *
0987: * @param nominalPredictor if true use nominal predictor attributes
0988: * @param numericPredictor if true use numeric predictor attributes
0989: * @param stringPredictor if true use string predictor attributes
0990: * @param datePredictor if true use date predictor attributes
0991: * @param relationalPredictor if true use relational predictor attributes
0992: * @param multiInstance whether multi-instance is needed
0993: * @param classType the class type (NUMERIC, NOMINAL, etc.)
0994: * @param predictorMissing true if the missing values may be in
0995: * the predictors
0996: * @param classMissing true if the missing values may be in the class
0997: * @param missingLevel the percentage of missing values
0998: * @return index 0 is true if the test was passed, index 1 is true if test
0999: * was acceptable
1000: */
1001: protected boolean[] canHandleMissing(boolean nominalPredictor,
1002: boolean numericPredictor, boolean stringPredictor,
1003: boolean datePredictor, boolean relationalPredictor,
1004: boolean multiInstance, int classType,
1005: boolean predictorMissing, boolean classMissing,
1006: int missingLevel) {
1007:
1008: if (missingLevel == 100)
1009: print("100% ");
1010: print("missing");
1011: if (predictorMissing) {
1012: print(" predictor");
1013: if (classMissing)
1014: print(" and");
1015: }
1016: if (classMissing)
1017: print(" class");
1018: print(" values");
1019: printAttributeSummary(nominalPredictor, numericPredictor,
1020: stringPredictor, datePredictor, relationalPredictor,
1021: multiInstance, classType);
1022: print("...");
1023: FastVector accepts = new FastVector();
1024: accepts.addElement("missing");
1025: accepts.addElement("value");
1026: accepts.addElement("train");
1027: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2;
1028:
1029: return runBasicTest(nominalPredictor, numericPredictor,
1030: stringPredictor, datePredictor, relationalPredictor,
1031: multiInstance, classType, missingLevel,
1032: predictorMissing, classMissing, numTrain, numTest,
1033: numClasses, accepts);
1034: }
1035:
1036: /**
1037: * Checks whether an updateable scheme produces the same model when
1038: * trained incrementally as when batch trained. The model itself
1039: * cannot be compared, so we compare the evaluation on test data
1040: * for both models. It is possible to get a false positive on this
1041: * test (likelihood depends on the classifier).
1042: *
1043: * @param nominalPredictor if true use nominal predictor attributes
1044: * @param numericPredictor if true use numeric predictor attributes
1045: * @param stringPredictor if true use string predictor attributes
1046: * @param datePredictor if true use date predictor attributes
1047: * @param relationalPredictor if true use relational predictor attributes
1048: * @param multiInstance whether multi-instance is needed
1049: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1050: * @return index 0 is true if the test was passed
1051: */
1052: protected boolean[] updatingEquality(boolean nominalPredictor,
1053: boolean numericPredictor, boolean stringPredictor,
1054: boolean datePredictor, boolean relationalPredictor,
1055: boolean multiInstance, int classType) {
1056:
1057: print("incremental training produces the same results"
1058: + " as batch training");
1059: printAttributeSummary(nominalPredictor, numericPredictor,
1060: stringPredictor, datePredictor, relationalPredictor,
1061: multiInstance, classType);
1062: print("...");
1063: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
1064: boolean predictorMissing = false, classMissing = false;
1065:
1066: boolean[] result = new boolean[2];
1067: Instances train = null;
1068: Instances test = null;
1069: Classifier[] classifiers = null;
1070: Evaluation evaluationB = null;
1071: Evaluation evaluationI = null;
1072: boolean built = false;
1073: try {
1074: train = makeTestDataset(42, numTrain,
1075: nominalPredictor ? getNumNominal() : 0,
1076: numericPredictor ? getNumNumeric() : 0,
1077: stringPredictor ? getNumString() : 0,
1078: datePredictor ? getNumDate() : 0,
1079: relationalPredictor ? getNumRelational() : 0,
1080: numClasses, classType, multiInstance);
1081: test = makeTestDataset(24, numTest,
1082: nominalPredictor ? getNumNominal() : 0,
1083: numericPredictor ? getNumNumeric() : 0,
1084: stringPredictor ? getNumString() : 0,
1085: datePredictor ? getNumDate() : 0,
1086: relationalPredictor ? getNumRelational() : 0,
1087: numClasses, classType, multiInstance);
1088: if (missingLevel > 0) {
1089: addMissing(train, missingLevel, predictorMissing,
1090: classMissing);
1091: addMissing(test, Math.min(missingLevel, 50),
1092: predictorMissing, classMissing);
1093: }
1094: classifiers = Classifier.makeCopies(getClassifier(), 2);
1095: evaluationB = new Evaluation(train);
1096: evaluationI = new Evaluation(train);
1097: classifiers[0].buildClassifier(train);
1098: testWRTZeroR(classifiers[0], evaluationB, train, test);
1099: } catch (Exception ex) {
1100: throw new Error("Error setting up for tests: "
1101: + ex.getMessage());
1102: }
1103: try {
1104: classifiers[1].buildClassifier(new Instances(train, 0));
1105: for (int i = 0; i < train.numInstances(); i++) {
1106: ((UpdateableClassifier) classifiers[1])
1107: .updateClassifier(train.instance(i));
1108: }
1109: built = true;
1110: testWRTZeroR(classifiers[1], evaluationI, train, test);
1111: if (!evaluationB.equals(evaluationI)) {
1112: println("no");
1113: result[0] = false;
1114:
1115: if (m_Debug) {
1116: println("\n=== Full Report ===");
1117: println("Results differ between batch and "
1118: + "incrementally built models.\n"
1119: + "Depending on the classifier, this may be OK");
1120: println("Here are the results:\n");
1121: println(evaluationB.toSummaryString(
1122: "\nbatch built results\n", true));
1123: println(evaluationI.toSummaryString(
1124: "\nincrementally built results\n", true));
1125: println("Here are the datasets:\n");
1126: println("=== Train Dataset ===\n"
1127: + train.toString() + "\n");
1128: println("=== Test Dataset ===\n" + test.toString()
1129: + "\n\n");
1130: }
1131: } else {
1132: println("yes");
1133: result[0] = true;
1134: }
1135: } catch (Exception ex) {
1136: result[0] = false;
1137:
1138: print("Problem during");
1139: if (built)
1140: print(" testing");
1141: else
1142: print(" training");
1143: println(": " + ex.getMessage() + "\n");
1144: }
1145:
1146: return result;
1147: }
1148:
1149: /**
1150: * Checks whether the classifier erroneously uses the class
1151: * value of test instances (if provided). Runs the classifier with
1152: * test instance class values set to missing and compares with results
1153: * when test instance class values are left intact.
1154: *
1155: * @param nominalPredictor if true use nominal predictor attributes
1156: * @param numericPredictor if true use numeric predictor attributes
1157: * @param stringPredictor if true use string predictor attributes
1158: * @param datePredictor if true use date predictor attributes
1159: * @param relationalPredictor if true use relational predictor attributes
1160: * @param multiInstance whether multi-instance is needed
1161: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1162: * @return index 0 is true if the test was passed
1163: */
1164: protected boolean[] doesntUseTestClassVal(boolean nominalPredictor,
1165: boolean numericPredictor, boolean stringPredictor,
1166: boolean datePredictor, boolean relationalPredictor,
1167: boolean multiInstance, int classType) {
1168:
1169: print("classifier ignores test instance class vals");
1170: printAttributeSummary(nominalPredictor, numericPredictor,
1171: stringPredictor, datePredictor, relationalPredictor,
1172: multiInstance, classType);
1173: print("...");
1174: int numTrain = 2 * getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
1175: boolean predictorMissing = false, classMissing = false;
1176:
1177: boolean[] result = new boolean[2];
1178: Instances train = null;
1179: Instances test = null;
1180: Classifier[] classifiers = null;
1181: boolean evalFail = false;
1182: try {
1183: train = makeTestDataset(42, numTrain,
1184: nominalPredictor ? getNumNominal() + 1 : 0,
1185: numericPredictor ? getNumNumeric() + 1 : 0,
1186: stringPredictor ? getNumString() : 0,
1187: datePredictor ? getNumDate() : 0,
1188: relationalPredictor ? getNumRelational() : 0,
1189: numClasses, classType, multiInstance);
1190: test = makeTestDataset(24, numTest,
1191: nominalPredictor ? getNumNominal() + 1 : 0,
1192: numericPredictor ? getNumNumeric() + 1 : 0,
1193: stringPredictor ? getNumString() : 0,
1194: datePredictor ? getNumDate() : 0,
1195: relationalPredictor ? getNumRelational() : 0,
1196: numClasses, classType, multiInstance);
1197: if (missingLevel > 0) {
1198: addMissing(train, missingLevel, predictorMissing,
1199: classMissing);
1200: addMissing(test, Math.min(missingLevel, 50),
1201: predictorMissing, classMissing);
1202: }
1203: classifiers = Classifier.makeCopies(getClassifier(), 2);
1204: classifiers[0].buildClassifier(train);
1205: classifiers[1].buildClassifier(train);
1206: } catch (Exception ex) {
1207: throw new Error("Error setting up for tests: "
1208: + ex.getMessage());
1209: }
1210: try {
1211:
1212: // Now set test values to missing when predicting
1213: for (int i = 0; i < test.numInstances(); i++) {
1214: Instance testInst = test.instance(i);
1215: Instance classMissingInst = (Instance) testInst.copy();
1216: classMissingInst.setDataset(test);
1217: classMissingInst.setClassMissing();
1218: double[] dist0 = classifiers[0]
1219: .distributionForInstance(testInst);
1220: double[] dist1 = classifiers[1]
1221: .distributionForInstance(classMissingInst);
1222: for (int j = 0; j < dist0.length; j++) {
1223: // ignore, if both are NaNs
1224: if (Double.isNaN(dist0[j])
1225: && Double.isNaN(dist1[j])) {
1226: if (getDebug())
1227: System.out
1228: .println("Both predictions are NaN!");
1229: continue;
1230: }
1231: // distribution different?
1232: if (dist0[j] != dist1[j]) {
1233: throw new Exception(
1234: "Prediction different for instance "
1235: + (i + 1));
1236: }
1237: }
1238: }
1239:
1240: println("yes");
1241: result[0] = true;
1242: } catch (Exception ex) {
1243: println("no");
1244: result[0] = false;
1245:
1246: if (m_Debug) {
1247: println("\n=== Full Report ===");
1248:
1249: if (evalFail) {
1250: println("Results differ between non-missing and "
1251: + "missing test class values.");
1252: } else {
1253: print("Problem during testing");
1254: println(": " + ex.getMessage() + "\n");
1255: }
1256: println("Here are the datasets:\n");
1257: println("=== Train Dataset ===\n" + train.toString()
1258: + "\n");
1259: println("=== Train Weights ===\n");
1260: for (int i = 0; i < train.numInstances(); i++) {
1261: println(" " + (i + 1) + " "
1262: + train.instance(i).weight());
1263: }
1264: println("=== Test Dataset ===\n" + test.toString()
1265: + "\n\n");
1266: println("(test weights all 1.0\n");
1267: }
1268: }
1269:
1270: return result;
1271: }
1272:
1273: /**
1274: * Checks whether the classifier can handle instance weights.
1275: * This test compares the classifier performance on two datasets
1276: * that are identical except for the training weights. If the
1277: * results change, then the classifier must be using the weights. It
1278: * may be possible to get a false positive from this test if the
1279: * weight changes aren't significant enough to induce a change
1280: * in classifier performance (but the weights are chosen to minimize
1281: * the likelihood of this).
1282: *
1283: * @param nominalPredictor if true use nominal predictor attributes
1284: * @param numericPredictor if true use numeric predictor attributes
1285: * @param stringPredictor if true use string predictor attributes
1286: * @param datePredictor if true use date predictor attributes
1287: * @param relationalPredictor if true use relational predictor attributes
1288: * @param multiInstance whether multi-instance is needed
1289: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1290: * @return index 0 true if the test was passed
1291: */
1292: protected boolean[] instanceWeights(boolean nominalPredictor,
1293: boolean numericPredictor, boolean stringPredictor,
1294: boolean datePredictor, boolean relationalPredictor,
1295: boolean multiInstance, int classType) {
1296:
1297: print("classifier uses instance weights");
1298: printAttributeSummary(nominalPredictor, numericPredictor,
1299: stringPredictor, datePredictor, relationalPredictor,
1300: multiInstance, classType);
1301: print("...");
1302: int numTrain = 2 * getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 0;
1303: boolean predictorMissing = false, classMissing = false;
1304:
1305: boolean[] result = new boolean[2];
1306: Instances train = null;
1307: Instances test = null;
1308: Classifier[] classifiers = null;
1309: Evaluation evaluationB = null;
1310: Evaluation evaluationI = null;
1311: boolean built = false;
1312: boolean evalFail = false;
1313: try {
1314: train = makeTestDataset(42, numTrain,
1315: nominalPredictor ? getNumNominal() + 1 : 0,
1316: numericPredictor ? getNumNumeric() + 1 : 0,
1317: stringPredictor ? getNumString() : 0,
1318: datePredictor ? getNumDate() : 0,
1319: relationalPredictor ? getNumRelational() : 0,
1320: numClasses, classType, multiInstance);
1321: test = makeTestDataset(24, numTest,
1322: nominalPredictor ? getNumNominal() + 1 : 0,
1323: numericPredictor ? getNumNumeric() + 1 : 0,
1324: stringPredictor ? getNumString() : 0,
1325: datePredictor ? getNumDate() : 0,
1326: relationalPredictor ? getNumRelational() : 0,
1327: numClasses, classType, multiInstance);
1328: if (missingLevel > 0) {
1329: addMissing(train, missingLevel, predictorMissing,
1330: classMissing);
1331: addMissing(test, Math.min(missingLevel, 50),
1332: predictorMissing, classMissing);
1333: }
1334: classifiers = Classifier.makeCopies(getClassifier(), 2);
1335: evaluationB = new Evaluation(train);
1336: evaluationI = new Evaluation(train);
1337: classifiers[0].buildClassifier(train);
1338: testWRTZeroR(classifiers[0], evaluationB, train, test);
1339: } catch (Exception ex) {
1340: throw new Error("Error setting up for tests: "
1341: + ex.getMessage());
1342: }
1343: try {
1344:
1345: // Now modify instance weights and re-built/test
1346: for (int i = 0; i < train.numInstances(); i++) {
1347: train.instance(i).setWeight(0);
1348: }
1349: Random random = new Random(1);
1350: for (int i = 0; i < train.numInstances() / 2; i++) {
1351: int inst = Math.abs(random.nextInt())
1352: % train.numInstances();
1353: int weight = Math.abs(random.nextInt()) % 10 + 1;
1354: train.instance(inst).setWeight(weight);
1355: }
1356: classifiers[1].buildClassifier(train);
1357: built = true;
1358: testWRTZeroR(classifiers[1], evaluationI, train, test);
1359: if (evaluationB.equals(evaluationI)) {
1360: // println("no");
1361: evalFail = true;
1362: throw new Exception("evalFail");
1363: }
1364:
1365: println("yes");
1366: result[0] = true;
1367: } catch (Exception ex) {
1368: println("no");
1369: result[0] = false;
1370:
1371: if (m_Debug) {
1372: println("\n=== Full Report ===");
1373:
1374: if (evalFail) {
1375: println("Results don't differ between non-weighted and "
1376: + "weighted instance models.");
1377: println("Here are the results:\n");
1378: println(evaluationB.toSummaryString(
1379: "\nboth methods\n", true));
1380: } else {
1381: print("Problem during");
1382: if (built) {
1383: print(" testing");
1384: } else {
1385: print(" training");
1386: }
1387: println(": " + ex.getMessage() + "\n");
1388: }
1389: println("Here are the datasets:\n");
1390: println("=== Train Dataset ===\n" + train.toString()
1391: + "\n");
1392: println("=== Train Weights ===\n");
1393: for (int i = 0; i < train.numInstances(); i++) {
1394: println(" " + (i + 1) + " "
1395: + train.instance(i).weight());
1396: }
1397: println("=== Test Dataset ===\n" + test.toString()
1398: + "\n\n");
1399: println("(test weights all 1.0\n");
1400: }
1401: }
1402:
1403: return result;
1404: }
1405:
1406: /**
1407: * Checks whether the scheme alters the training dataset during
1408: * training. If the scheme needs to modify the training
1409: * data it should take a copy of the training data. Currently checks
1410: * for changes to header structure, number of instances, order of
1411: * instances, instance weights.
1412: *
1413: * @param nominalPredictor if true use nominal predictor attributes
1414: * @param numericPredictor if true use numeric predictor attributes
1415: * @param stringPredictor if true use string predictor attributes
1416: * @param datePredictor if true use date predictor attributes
1417: * @param relationalPredictor if true use relational predictor attributes
1418: * @param multiInstance whether multi-instance is needed
1419: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1420: * @param predictorMissing true if we know the classifier can handle
1421: * (at least) moderate missing predictor values
1422: * @param classMissing true if we know the classifier can handle
1423: * (at least) moderate missing class values
1424: * @return index 0 is true if the test was passed
1425: */
1426: protected boolean[] datasetIntegrity(boolean nominalPredictor,
1427: boolean numericPredictor, boolean stringPredictor,
1428: boolean datePredictor, boolean relationalPredictor,
1429: boolean multiInstance, int classType,
1430: boolean predictorMissing, boolean classMissing) {
1431:
1432: print("classifier doesn't alter original datasets");
1433: printAttributeSummary(nominalPredictor, numericPredictor,
1434: stringPredictor, datePredictor, relationalPredictor,
1435: multiInstance, classType);
1436: print("...");
1437: int numTrain = getNumInstances(), numTest = getNumInstances(), numClasses = 2, missingLevel = 20;
1438:
1439: boolean[] result = new boolean[2];
1440: Instances train = null;
1441: Instances test = null;
1442: Classifier classifier = null;
1443: Evaluation evaluation = null;
1444: boolean built = false;
1445: try {
1446: train = makeTestDataset(42, numTrain,
1447: nominalPredictor ? getNumNominal() : 0,
1448: numericPredictor ? getNumNumeric() : 0,
1449: stringPredictor ? getNumString() : 0,
1450: datePredictor ? getNumDate() : 0,
1451: relationalPredictor ? getNumRelational() : 0,
1452: numClasses, classType, multiInstance);
1453: test = makeTestDataset(24, numTest,
1454: nominalPredictor ? getNumNominal() : 0,
1455: numericPredictor ? getNumNumeric() : 0,
1456: stringPredictor ? getNumString() : 0,
1457: datePredictor ? getNumDate() : 0,
1458: relationalPredictor ? getNumRelational() : 0,
1459: numClasses, classType, multiInstance);
1460: if (missingLevel > 0) {
1461: addMissing(train, missingLevel, predictorMissing,
1462: classMissing);
1463: addMissing(test, Math.min(missingLevel, 50),
1464: predictorMissing, classMissing);
1465: }
1466: classifier = Classifier.makeCopies(getClassifier(), 1)[0];
1467: evaluation = new Evaluation(train);
1468: } catch (Exception ex) {
1469: throw new Error("Error setting up for tests: "
1470: + ex.getMessage());
1471: }
1472: try {
1473: Instances trainCopy = new Instances(train);
1474: Instances testCopy = new Instances(test);
1475: classifier.buildClassifier(trainCopy);
1476: compareDatasets(train, trainCopy);
1477: built = true;
1478: testWRTZeroR(classifier, evaluation, trainCopy, testCopy);
1479: compareDatasets(test, testCopy);
1480:
1481: println("yes");
1482: result[0] = true;
1483: } catch (Exception ex) {
1484: println("no");
1485: result[0] = false;
1486:
1487: if (m_Debug) {
1488: println("\n=== Full Report ===");
1489: print("Problem during");
1490: if (built) {
1491: print(" testing");
1492: } else {
1493: print(" training");
1494: }
1495: println(": " + ex.getMessage() + "\n");
1496: println("Here are the datasets:\n");
1497: println("=== Train Dataset ===\n" + train.toString()
1498: + "\n");
1499: println("=== Test Dataset ===\n" + test.toString()
1500: + "\n\n");
1501: }
1502: }
1503:
1504: return result;
1505: }
1506:
1507: /**
1508: * Runs a text on the datasets with the given characteristics.
1509: *
1510: * @param nominalPredictor if true use nominal predictor attributes
1511: * @param numericPredictor if true use numeric predictor attributes
1512: * @param stringPredictor if true use string predictor attributes
1513: * @param datePredictor if true use date predictor attributes
1514: * @param relationalPredictor if true use relational predictor attributes
1515: * @param multiInstance whether multi-instance is needed
1516: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1517: * @param missingLevel the percentage of missing values
1518: * @param predictorMissing true if the missing values may be in
1519: * the predictors
1520: * @param classMissing true if the missing values may be in the class
1521: * @param numTrain the number of instances in the training set
1522: * @param numTest the number of instaces in the test set
1523: * @param numClasses the number of classes
1524: * @param accepts the acceptable string in an exception
1525: * @return index 0 is true if the test was passed, index 1 is true if test
1526: * was acceptable
1527: */
1528: protected boolean[] runBasicTest(boolean nominalPredictor,
1529: boolean numericPredictor, boolean stringPredictor,
1530: boolean datePredictor, boolean relationalPredictor,
1531: boolean multiInstance, int classType, int missingLevel,
1532: boolean predictorMissing, boolean classMissing,
1533: int numTrain, int numTest, int numClasses,
1534: FastVector accepts) {
1535:
1536: return runBasicTest(nominalPredictor, numericPredictor,
1537: stringPredictor, datePredictor, relationalPredictor,
1538: multiInstance, classType, TestInstances.CLASS_IS_LAST,
1539: missingLevel, predictorMissing, classMissing, numTrain,
1540: numTest, numClasses, accepts);
1541: }
1542:
1543: /**
1544: * Runs a text on the datasets with the given characteristics.
1545: *
1546: * @param nominalPredictor if true use nominal predictor attributes
1547: * @param numericPredictor if true use numeric predictor attributes
1548: * @param stringPredictor if true use string predictor attributes
1549: * @param datePredictor if true use date predictor attributes
1550: * @param relationalPredictor if true use relational predictor attributes
1551: * @param multiInstance whether multi-instance is needed
1552: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1553: * @param classIndex the attribute index of the class
1554: * @param missingLevel the percentage of missing values
1555: * @param predictorMissing true if the missing values may be in
1556: * the predictors
1557: * @param classMissing true if the missing values may be in the class
1558: * @param numTrain the number of instances in the training set
1559: * @param numTest the number of instaces in the test set
1560: * @param numClasses the number of classes
1561: * @param accepts the acceptable string in an exception
1562: * @return index 0 is true if the test was passed, index 1 is true if test
1563: * was acceptable
1564: */
1565: protected boolean[] runBasicTest(boolean nominalPredictor,
1566: boolean numericPredictor, boolean stringPredictor,
1567: boolean datePredictor, boolean relationalPredictor,
1568: boolean multiInstance, int classType, int classIndex,
1569: int missingLevel, boolean predictorMissing,
1570: boolean classMissing, int numTrain, int numTest,
1571: int numClasses, FastVector accepts) {
1572:
1573: boolean[] result = new boolean[2];
1574: Instances train = null;
1575: Instances test = null;
1576: Classifier classifier = null;
1577: Evaluation evaluation = null;
1578: boolean built = false;
1579: try {
1580: train = makeTestDataset(42, numTrain,
1581: nominalPredictor ? getNumNominal() : 0,
1582: numericPredictor ? getNumNumeric() : 0,
1583: stringPredictor ? getNumString() : 0,
1584: datePredictor ? getNumDate() : 0,
1585: relationalPredictor ? getNumRelational() : 0,
1586: numClasses, classType, classIndex, multiInstance);
1587: test = makeTestDataset(24, numTest,
1588: nominalPredictor ? getNumNominal() : 0,
1589: numericPredictor ? getNumNumeric() : 0,
1590: stringPredictor ? getNumString() : 0,
1591: datePredictor ? getNumDate() : 0,
1592: relationalPredictor ? getNumRelational() : 0,
1593: numClasses, classType, classIndex, multiInstance);
1594: if (missingLevel > 0) {
1595: addMissing(train, missingLevel, predictorMissing,
1596: classMissing);
1597: addMissing(test, Math.min(missingLevel, 50),
1598: predictorMissing, classMissing);
1599: }
1600: classifier = Classifier.makeCopies(getClassifier(), 1)[0];
1601: evaluation = new Evaluation(train);
1602: } catch (Exception ex) {
1603: ex.printStackTrace();
1604: throw new Error("Error setting up for tests: "
1605: + ex.getMessage());
1606: }
1607: try {
1608: classifier.buildClassifier(train);
1609: built = true;
1610: if (!testWRTZeroR(classifier, evaluation, train, test)[0]) {
1611: result[0] = true;
1612: result[1] = true;
1613: throw new Exception("Scheme performs worse than ZeroR");
1614: }
1615:
1616: println("yes");
1617: result[0] = true;
1618: } catch (Exception ex) {
1619: boolean acceptable = false;
1620: String msg;
1621: if (ex.getMessage() == null)
1622: msg = "";
1623: else
1624: msg = ex.getMessage().toLowerCase();
1625: if (msg.indexOf("not in classpath") > -1)
1626: m_ClasspathProblems = true;
1627: if (msg.indexOf("worse than zeror") >= 0) {
1628: println("warning: performs worse than ZeroR");
1629: result[0] = true;
1630: result[1] = true;
1631: } else {
1632: for (int i = 0; i < accepts.size(); i++) {
1633: if (msg.indexOf((String) accepts.elementAt(i)) >= 0) {
1634: acceptable = true;
1635: }
1636: }
1637:
1638: println("no"
1639: + (acceptable ? " (OK error message)" : ""));
1640: result[1] = acceptable;
1641: }
1642:
1643: if (m_Debug) {
1644: println("\n=== Full Report ===");
1645: print("Problem during");
1646: if (built) {
1647: print(" testing");
1648: } else {
1649: print(" training");
1650: }
1651: println(": " + ex.getMessage() + "\n");
1652: if (!acceptable) {
1653: if (accepts.size() > 0) {
1654: print("Error message doesn't mention ");
1655: for (int i = 0; i < accepts.size(); i++) {
1656: if (i != 0) {
1657: print(" or ");
1658: }
1659: print('"' + (String) accepts.elementAt(i) + '"');
1660: }
1661: }
1662: println("here are the datasets:\n");
1663: println("=== Train Dataset ===\n"
1664: + train.toString() + "\n");
1665: println("=== Test Dataset ===\n" + test.toString()
1666: + "\n\n");
1667: }
1668: }
1669: }
1670:
1671: return result;
1672: }
1673:
1674: /**
1675: * Determine whether the scheme performs worse than ZeroR during testing
1676: *
1677: * @param classifier the pre-trained classifier
1678: * @param evaluation the classifier evaluation object
1679: * @param train the training data
1680: * @param test the test data
1681: * @return index 0 is true if the scheme performs better than ZeroR
1682: * @throws Exception if there was a problem during the scheme's testing
1683: */
1684: protected boolean[] testWRTZeroR(Classifier classifier,
1685: Evaluation evaluation, Instances train, Instances test)
1686: throws Exception {
1687:
1688: boolean[] result = new boolean[2];
1689:
1690: evaluation.evaluateModel(classifier, test);
1691: try {
1692:
1693: // Tested OK, compare with ZeroR
1694: Classifier zeroR = new weka.classifiers.rules.ZeroR();
1695: zeroR.buildClassifier(train);
1696: Evaluation zeroREval = new Evaluation(train);
1697: zeroREval.evaluateModel(zeroR, test);
1698: result[0] = Utils.grOrEq(zeroREval.errorRate(), evaluation
1699: .errorRate());
1700: } catch (Exception ex) {
1701: throw new Error("Problem determining ZeroR performance: "
1702: + ex.getMessage());
1703: }
1704:
1705: return result;
1706: }
1707:
1708: /**
1709: * Make a simple set of instances, which can later be modified
1710: * for use in specific tests.
1711: *
1712: * @param seed the random number seed
1713: * @param numInstances the number of instances to generate
1714: * @param numNominal the number of nominal attributes
1715: * @param numNumeric the number of numeric attributes
1716: * @param numString the number of string attributes
1717: * @param numDate the number of date attributes
1718: * @param numRelational the number of relational attributes
1719: * @param numClasses the number of classes (if nominal class)
1720: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1721: * @param multiInstance whether the dataset should a multi-instance dataset
1722: * @return the test dataset
1723: * @throws Exception if the dataset couldn't be generated
1724: * @see #process(Instances)
1725: */
1726: protected Instances makeTestDataset(int seed, int numInstances,
1727: int numNominal, int numNumeric, int numString, int numDate,
1728: int numRelational, int numClasses, int classType,
1729: boolean multiInstance) throws Exception {
1730:
1731: return makeTestDataset(seed, numInstances, numNominal,
1732: numNumeric, numString, numDate, numRelational,
1733: numClasses, classType, TestInstances.CLASS_IS_LAST,
1734: multiInstance);
1735: }
1736:
1737: /**
1738: * Make a simple set of instances with variable position of the class
1739: * attribute, which can later be modified for use in specific tests.
1740: *
1741: * @param seed the random number seed
1742: * @param numInstances the number of instances to generate
1743: * @param numNominal the number of nominal attributes
1744: * @param numNumeric the number of numeric attributes
1745: * @param numString the number of string attributes
1746: * @param numDate the number of date attributes
1747: * @param numRelational the number of relational attributes
1748: * @param numClasses the number of classes (if nominal class)
1749: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1750: * @param classIndex the index of the class (0-based, -1 as last)
1751: * @param multiInstance whether the dataset should a multi-instance dataset
1752: * @return the test dataset
1753: * @throws Exception if the dataset couldn't be generated
1754: * @see TestInstances#CLASS_IS_LAST
1755: * @see #process(Instances)
1756: */
1757: protected Instances makeTestDataset(int seed, int numInstances,
1758: int numNominal, int numNumeric, int numString, int numDate,
1759: int numRelational, int numClasses, int classType,
1760: int classIndex, boolean multiInstance) throws Exception {
1761:
1762: TestInstances dataset = new TestInstances();
1763:
1764: dataset.setSeed(seed);
1765: dataset.setNumInstances(numInstances);
1766: dataset.setNumNominal(numNominal);
1767: dataset.setNumNumeric(numNumeric);
1768: dataset.setNumString(numString);
1769: dataset.setNumDate(numDate);
1770: dataset.setNumRelational(numRelational);
1771: dataset.setNumClasses(numClasses);
1772: dataset.setClassType(classType);
1773: dataset.setClassIndex(classIndex);
1774: dataset.setNumClasses(numClasses);
1775: dataset.setMultiInstance(multiInstance);
1776: dataset.setWords(getWords());
1777: dataset.setWordSeparators(getWordSeparators());
1778:
1779: return process(dataset.generate());
1780: }
1781:
1782: /**
1783: * Print out a short summary string for the dataset characteristics
1784: *
1785: * @param nominalPredictor true if nominal predictor attributes are present
1786: * @param numericPredictor true if numeric predictor attributes are present
1787: * @param stringPredictor true if string predictor attributes are present
1788: * @param datePredictor true if date predictor attributes are present
1789: * @param relationalPredictor true if relational predictor attributes are present
1790: * @param multiInstance whether multi-instance is needed
1791: * @param classType the class type (NUMERIC, NOMINAL, etc.)
1792: */
1793: protected void printAttributeSummary(boolean nominalPredictor,
1794: boolean numericPredictor, boolean stringPredictor,
1795: boolean datePredictor, boolean relationalPredictor,
1796: boolean multiInstance, int classType) {
1797:
1798: String str = "";
1799:
1800: if (numericPredictor)
1801: str += " numeric";
1802:
1803: if (nominalPredictor) {
1804: if (str.length() > 0)
1805: str += " &";
1806: str += " nominal";
1807: }
1808:
1809: if (stringPredictor) {
1810: if (str.length() > 0)
1811: str += " &";
1812: str += " string";
1813: }
1814:
1815: if (datePredictor) {
1816: if (str.length() > 0)
1817: str += " &";
1818: str += " date";
1819: }
1820:
1821: if (relationalPredictor) {
1822: if (str.length() > 0)
1823: str += " &";
1824: str += " relational";
1825: }
1826:
1827: str += " predictors)";
1828:
1829: switch (classType) {
1830: case Attribute.NUMERIC:
1831: str = " (numeric class," + str;
1832: break;
1833: case Attribute.NOMINAL:
1834: str = " (nominal class," + str;
1835: break;
1836: case Attribute.STRING:
1837: str = " (string class," + str;
1838: break;
1839: case Attribute.DATE:
1840: str = " (date class," + str;
1841: break;
1842: case Attribute.RELATIONAL:
1843: str = " (relational class," + str;
1844: break;
1845: }
1846:
1847: print(str);
1848: }
1849:
1850: /**
1851: * Test method for this class
1852: *
1853: * @param args the commandline parameters
1854: */
1855: public static void main(String[] args) {
1856: runCheck(new CheckClassifier(), args);
1857: }
1858: }
|