001: /*
002: * This program is free software; you can redistribute it and/or modify
003: * it under the terms of the GNU General Public License as published by
004: * the Free Software Foundation; either version 2 of the License, or
005: * (at your option) any later version.
006: *
007: * This program is distributed in the hope that it will be useful,
008: * but WITHOUT ANY WARRANTY; without even the implied warranty of
009: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
010: * GNU General Public License for more details.
011: *
012: * You should have received a copy of the GNU General Public License
013: * along with this program; if not, write to the Free Software
014: * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
015: */
016:
017: /*
018: * BIFReader.java
019: * Copyright (C) 2003 University of Waikato, Hamilton, New Zealand
020: *
021: */
022:
023: package weka.classifiers.bayes.net;
024:
025: import weka.classifiers.bayes.BayesNet;
026: import weka.classifiers.bayes.net.estimate.DiscreteEstimatorBayes;
027: import weka.core.FastVector;
028: import weka.core.Instances;
029: import weka.core.TechnicalInformation;
030: import weka.core.TechnicalInformation.Type;
031: import weka.core.TechnicalInformation.Field;
032: import weka.core.TechnicalInformationHandler;
033: import weka.estimators.Estimator;
034:
035: import java.io.File;
036: import java.util.StringTokenizer;
037:
038: import javax.xml.parsers.DocumentBuilderFactory;
039:
040: import org.w3c.dom.CharacterData;
041: import org.w3c.dom.Document;
042: import org.w3c.dom.Element;
043: import org.w3c.dom.Node;
044: import org.w3c.dom.NodeList;
045:
046: /**
047: <!-- globalinfo-start -->
048: * Builds a description of a Bayes Net classifier stored in XML BIF 0.3 format.<br/>
049: * <br/>
050: * For more details on XML BIF see:<br/>
051: * <br/>
052: * Fabio Cozman, Marek Druzdzel, Daniel Garcia (1998). XML BIF version 0.3. URL http://www-2.cs.cmu.edu/~fgcozman/Research/InterchangeFormat/.
053: * <p/>
054: <!-- globalinfo-end -->
055: *
056: <!-- technical-bibtex-start -->
057: * BibTeX:
058: * <pre>
059: * @misc{Cozman1998,
060: * author = {Fabio Cozman and Marek Druzdzel and Daniel Garcia},
061: * title = {XML BIF version 0.3},
062: * year = {1998},
063: * URL = {http://www-2.cs.cmu.edu/~fgcozman/Research/InterchangeFormat/}
064: * }
065: * </pre>
066: * <p/>
067: <!-- technical-bibtex-end -->
068: *
069: <!-- options-start -->
070: * Valid options are: <p/>
071: *
072: * <pre> -D
073: * Do not use ADTree data structure
074: * </pre>
075: *
076: * <pre> -B <BIF file>
077: * BIF file to compare with
078: * </pre>
079: *
080: * <pre> -Q weka.classifiers.bayes.net.search.SearchAlgorithm
081: * Search algorithm
082: * </pre>
083: *
084: * <pre> -E weka.classifiers.bayes.net.estimate.SimpleEstimator
085: * Estimator algorithm
086: * </pre>
087: *
088: <!-- options-end -->
089: *
090: * @author Remco Bouckaert (rrb@xm.co.nz)
091: * @version $Revision: 1.11 $
092: */
093: public class BIFReader extends BayesNet implements
094: TechnicalInformationHandler {
095:
096: private int[] m_nPositionX;
097: private int[] m_nPositionY;
098: private int[] m_order;
099:
100: /** for serialization */
101: static final long serialVersionUID = -8358864680379881429L;
102:
103: /**
104: * This will return a string describing the classifier.
105: * @return The string.
106: */
107: public String globalInfo() {
108: return "Builds a description of a Bayes Net classifier stored in XML "
109: + "BIF 0.3 format.\n\n"
110: + "For more details on XML BIF see:\n\n"
111: + getTechnicalInformation().toString();
112: }
113:
114: /** processFile reads a BIFXML file and initializes a Bayes Net
115: * @param sFile name of the file to parse
116: * @return the BIFReader
117: * @throws Exception if processing fails
118: */
119: public BIFReader processFile(String sFile) throws Exception {
120: m_sFile = sFile;
121: DocumentBuilderFactory factory = DocumentBuilderFactory
122: .newInstance();
123: factory.setValidating(true);
124: Document doc = factory.newDocumentBuilder().parse(
125: new File(sFile));
126: doc.normalize();
127:
128: buildInstances(doc, sFile);
129: buildStructure(doc);
130: return this ;
131: } // processFile
132:
133: /** the current filename */
134: String m_sFile;
135:
136: /**
137: * returns the current filename
138: *
139: * @return the current filename
140: */
141: public String getFileName() {
142: return m_sFile;
143: }
144:
145: /**
146: * Returns an instance of a TechnicalInformation object, containing
147: * detailed information about the technical background of this class,
148: * e.g., paper reference or book this class is based on.
149: *
150: * @return the technical information about this class
151: */
152: public TechnicalInformation getTechnicalInformation() {
153: TechnicalInformation result;
154:
155: result = new TechnicalInformation(Type.MISC);
156: result.setValue(Field.AUTHOR,
157: "Fabio Cozman and Marek Druzdzel and Daniel Garcia");
158: result.setValue(Field.YEAR, "1998");
159: result.setValue(Field.TITLE, "XML BIF version 0.3");
160: result
161: .setValue(Field.URL,
162: "http://www-2.cs.cmu.edu/~fgcozman/Research/InterchangeFormat/");
163:
164: return result;
165: }
166:
167: /** buildStructure parses the BIF document in the DOM tree contained
168: * in the doc parameter and specifies the the network structure and
169: * probability tables.
170: * It assumes that buildInstances has been called before
171: * @param doc DOM document containing BIF document in DOM tree
172: * @throws Exception if building of structure fails
173: */
174: void buildStructure(Document doc) throws Exception {
175: // Get the name of the network
176: // initialize conditional distribution tables
177: m_Distributions = new Estimator[m_Instances.numAttributes()][];
178: for (int iNode = 0; iNode < m_Instances.numAttributes(); iNode++) {
179: // find definition that goes with this node
180: String sName = m_Instances.attribute(iNode).name();
181: Element definition = getDefinition(doc, sName);
182: /*
183: if (nodelist.getLength() == 0) {
184: throw new Exception("No definition found for node " + sName);
185: }
186: if (nodelist.getLength() > 1) {
187: System.err.println("More than one definition found for node " + sName + ". Using first definition.");
188: }
189: Element definition = (Element) nodelist.item(0);
190: */
191:
192: // get the parents for this node
193: // resolve structure
194: FastVector nodelist = getParentNodes(definition);
195: for (int iParent = 0; iParent < nodelist.size(); iParent++) {
196: Node parentName = ((Node) nodelist.elementAt(iParent))
197: .getFirstChild();
198: String sParentName = ((CharacterData) (parentName))
199: .getData();
200: int nParent = getNode(sParentName);
201: m_ParentSets[iNode].addParent(nParent, m_Instances);
202: }
203: // resolve conditional probability table
204: int nCardinality = m_ParentSets[iNode]
205: .getCardinalityOfParents();
206: int nValues = m_Instances.attribute(iNode).numValues();
207: m_Distributions[iNode] = new Estimator[nCardinality];
208: for (int i = 0; i < nCardinality; i++) {
209: m_Distributions[iNode][i] = new DiscreteEstimatorBayes(
210: nValues, 0.0f);
211: }
212:
213: /*
214: StringBuffer sTable = new StringBuffer();
215: for (int iText = 0; iText < nodelist.getLength(); iText++) {
216: sTable.append(((CharacterData) (nodelist.item(iText))).getData());
217: sTable.append(' ');
218: }
219: StringTokenizer st = new StringTokenizer(sTable.toString());
220: */
221: String sTable = getTable(definition);
222: StringTokenizer st = new StringTokenizer(sTable.toString());
223:
224: for (int i = 0; i < nCardinality; i++) {
225: DiscreteEstimatorBayes d = (DiscreteEstimatorBayes) m_Distributions[iNode][i];
226: for (int iValue = 0; iValue < nValues; iValue++) {
227: String sWeight = st.nextToken();
228: d.addValue(iValue, new Double(sWeight)
229: .doubleValue());
230: }
231: }
232: }
233: } // buildStructure
234:
235: /** synchronizes the node ordering of this Bayes network with
236: * those in the other network (if possible).
237: * @param other Bayes network to synchronize with
238: * @throws Exception if nr of attributes differs or not all of the variables have the same name.
239: */
240: public void Sync(BayesNet other) throws Exception {
241: int nAtts = m_Instances.numAttributes();
242: if (nAtts != other.m_Instances.numAttributes()) {
243: throw new Exception(
244: "Cannot synchronize networks: different number of attributes.");
245: }
246: m_order = new int[nAtts];
247: for (int iNode = 0; iNode < nAtts; iNode++) {
248: String sName = other.getNodeName(iNode);
249: m_order[getNode(sName)] = iNode;
250: }
251: } // Sync
252:
253: /** getNode finds the index of the node with name sNodeName
254: * and throws an exception if no such node can be found.
255: * @param sNodeName name of the node to get the index from
256: * @return index of the node with name sNodeName
257: * @throws Exception if node cannot be found
258: */
259: public int getNode(String sNodeName) throws Exception {
260: int iNode = 0;
261: while (iNode < m_Instances.numAttributes()) {
262: if (m_Instances.attribute(iNode).name().equals(sNodeName)) {
263: return iNode;
264: }
265: iNode++;
266: }
267: throw new Exception("Could not find node [[" + sNodeName + "]]");
268: } // getNode
269:
270: /**
271: * Returns all TEXT children of the given node in one string. Between
272: * the node values new lines are inserted.
273: *
274: * @param node the node to return the content for
275: * @return the content of the node
276: */
277: public String getContent(Element node) {
278: NodeList list;
279: Node item;
280: int i;
281: String result;
282:
283: result = "";
284: list = node.getChildNodes();
285:
286: for (i = 0; i < list.getLength(); i++) {
287: item = list.item(i);
288: if (item.getNodeType() == Node.TEXT_NODE)
289: result += "\n" + item.getNodeValue();
290: }
291:
292: return result;
293: }
294:
295: /** buildInstances parses the BIF document and creates a Bayes Net with its
296: * nodes specified, but leaves the network structure and probability tables empty.
297: * @param doc DOM document containing BIF document in DOM tree
298: * @param sName default name to give to the Bayes Net. Will be overridden if specified in the BIF document.
299: * @throws Exception if building fails
300: */
301: void buildInstances(Document doc, String sName) throws Exception {
302: NodeList nodelist;
303: // Get the name of the network
304: nodelist = selectAllNames(doc);
305: if (nodelist.getLength() > 0) {
306: sName = ((CharacterData) (nodelist.item(0).getFirstChild()))
307: .getData();
308: }
309:
310: // Process variables
311: nodelist = selectAllVariables(doc);
312: int nNodes = nodelist.getLength();
313: // initialize structure
314: FastVector attInfo = new FastVector(nNodes);
315:
316: // Initialize
317: m_nPositionX = new int[nodelist.getLength()];
318: m_nPositionY = new int[nodelist.getLength()];
319:
320: // Process variables
321: for (int iNode = 0; iNode < nodelist.getLength(); iNode++) {
322: // Get element
323: FastVector valueslist;
324: // Get the name of the network
325: valueslist = selectOutCome(nodelist.item(iNode));
326:
327: int nValues = valueslist.size();
328: // generate value strings
329: FastVector nomStrings = new FastVector(nValues + 1);
330: for (int iValue = 0; iValue < nValues; iValue++) {
331: Node node = ((Node) valueslist.elementAt(iValue))
332: .getFirstChild();
333: String sValue = ((CharacterData) (node)).getData();
334: if (sValue == null) {
335: sValue = "Value" + (iValue + 1);
336: }
337: nomStrings.addElement(sValue);
338: }
339: FastVector nodelist2;
340: // Get the name of the network
341: nodelist2 = selectName(nodelist.item(iNode));
342: if (nodelist2.size() == 0) {
343: throw new Exception("No name specified for variable");
344: }
345: String sNodeName = ((CharacterData) (((Node) nodelist2
346: .elementAt(0)).getFirstChild())).getData();
347:
348: weka.core.Attribute att = new weka.core.Attribute(
349: sNodeName, nomStrings);
350: attInfo.addElement(att);
351:
352: valueslist = selectProperty(nodelist.item(iNode));
353: nValues = valueslist.size();
354: // generate value strings
355: for (int iValue = 0; iValue < nValues; iValue++) {
356: // parsing for strings of the form "position = (73, 165)"
357: Node node = ((Node) valueslist.elementAt(iValue))
358: .getFirstChild();
359: String sValue = ((CharacterData) (node)).getData();
360: if (sValue.startsWith("position")) {
361: int i0 = sValue.indexOf('(');
362: int i1 = sValue.indexOf(',');
363: int i2 = sValue.indexOf(')');
364: String sX = sValue.substring(i0 + 1, i1).trim();
365: String sY = sValue.substring(i1 + 1, i2).trim();
366: try {
367: m_nPositionX[iNode] = (int) Integer
368: .parseInt(sX);
369: m_nPositionY[iNode] = (int) Integer
370: .parseInt(sY);
371: } catch (NumberFormatException e) {
372: System.err
373: .println("Wrong number format in position :("
374: + sX + "," + sY + ")");
375: m_nPositionX[iNode] = 0;
376: m_nPositionY[iNode] = 0;
377: }
378: }
379: }
380:
381: }
382:
383: m_Instances = new Instances(sName, attInfo, 100);
384: m_Instances.setClassIndex(nNodes - 1);
385: setUseADTree(false);
386: initStructure();
387: } // buildInstances
388:
389: // /** selectNodeList selects list of nodes from document specified in XPath expression
390: // * @param doc : document (or node) to query
391: // * @param sXPath : XPath expression
392: // * @return list of nodes conforming to XPath expression in doc
393: // * @throws Exception
394: // */
395: // private NodeList selectNodeList(Node doc, String sXPath) throws Exception {
396: // NodeList nodelist = org.apache.xpath.XPathAPI.selectNodeList(doc, sXPath);
397: // return nodelist;
398: // } // selectNodeList
399:
400: NodeList selectAllNames(Document doc) throws Exception {
401: //NodeList nodelist = selectNodeList(doc, "//NAME");
402: NodeList nodelist = doc.getElementsByTagName("NAME");
403: return nodelist;
404: } // selectAllNames
405:
406: NodeList selectAllVariables(Document doc) throws Exception {
407: //NodeList nodelist = selectNodeList(doc, "//VARIABLE");
408: NodeList nodelist = doc.getElementsByTagName("VARIABLE");
409: return nodelist;
410: } // selectAllVariables
411:
412: Element getDefinition(Document doc, String sName) throws Exception {
413: //NodeList nodelist = selectNodeList(doc, "//DEFINITION[normalize-space(FOR/text())=\"" + sName + "\"]");
414:
415: NodeList nodelist = doc.getElementsByTagName("DEFINITION");
416: for (int iNode = 0; iNode < nodelist.getLength(); iNode++) {
417: Node node = nodelist.item(iNode);
418: FastVector list = selectElements(node, "FOR");
419: if (list.size() > 0) {
420: Node forNode = (Node) list.elementAt(0);
421: if (getContent((Element) forNode).trim().equals(sName)) {
422: return (Element) node;
423: }
424: }
425: }
426: throw new Exception("Could not find definition for ((" + sName
427: + "))");
428: } // getDefinition
429:
430: FastVector getParentNodes(Node definition) throws Exception {
431: //NodeList nodelist = selectNodeList(definition, "GIVEN");
432: FastVector nodelist = selectElements(definition, "GIVEN");
433: return nodelist;
434: } // getParentNodes
435:
436: String getTable(Node definition) throws Exception {
437: //NodeList nodelist = selectNodeList(definition, "TABLE/text()");
438: FastVector nodelist = selectElements(definition, "TABLE");
439: String sTable = getContent((Element) nodelist.elementAt(0));
440: sTable = sTable.replaceAll("\\n", " ");
441: return sTable;
442: } // getTable
443:
444: FastVector selectOutCome(Node item) throws Exception {
445: //NodeList nodelist = selectNodeList(item, "OUTCOME");
446: FastVector nodelist = selectElements(item, "OUTCOME");
447: return nodelist;
448: } // selectOutCome
449:
450: FastVector selectName(Node item) throws Exception {
451: //NodeList nodelist = selectNodeList(item, "NAME");
452: FastVector nodelist = selectElements(item, "NAME");
453: return nodelist;
454: } // selectName
455:
456: FastVector selectProperty(Node item) throws Exception {
457: // NodeList nodelist = selectNodeList(item, "PROPERTY");
458: FastVector nodelist = selectElements(item, "PROPERTY");
459: return nodelist;
460: } // selectProperty
461:
462: FastVector selectElements(Node item, String sElement)
463: throws Exception {
464: NodeList children = item.getChildNodes();
465: FastVector nodelist = new FastVector();
466: for (int iNode = 0; iNode < children.getLength(); iNode++) {
467: Node node = children.item(iNode);
468: if ((node.getNodeType() == Node.ELEMENT_NODE)
469: && node.getNodeName().equals(sElement)) {
470: nodelist.addElement(node);
471: }
472: }
473: return nodelist;
474: } // selectElements
475:
476: /** Count nr of arcs missing from other network compared to current network
477: * Note that an arc is not 'missing' if it is reversed.
478: * @param other network to compare with
479: * @return nr of missing arcs
480: */
481: public int missingArcs(BayesNet other) {
482: try {
483: Sync(other);
484: int nMissing = 0;
485: for (int iAttribute = 0; iAttribute < m_Instances
486: .numAttributes(); iAttribute++) {
487: for (int iParent = 0; iParent < m_ParentSets[iAttribute]
488: .getNrOfParents(); iParent++) {
489: int nParent = m_ParentSets[iAttribute]
490: .getParent(iParent);
491: if (!other.getParentSet(m_order[iAttribute])
492: .contains(m_order[nParent])
493: && !other.getParentSet(m_order[nParent])
494: .contains(m_order[iAttribute])) {
495: nMissing++;
496: }
497: }
498: }
499: return nMissing;
500: } catch (Exception e) {
501: System.err.println(e.getMessage());
502: return 0;
503: }
504: } // missingArcs
505:
506: /** Count nr of exta arcs from other network compared to current network
507: * Note that an arc is not 'extra' if it is reversed.
508: * @param other network to compare with
509: * @return nr of missing arcs
510: */
511: public int extraArcs(BayesNet other) {
512: try {
513: Sync(other);
514: int nExtra = 0;
515: for (int iAttribute = 0; iAttribute < m_Instances
516: .numAttributes(); iAttribute++) {
517: for (int iParent = 0; iParent < other.getParentSet(
518: m_order[iAttribute]).getNrOfParents(); iParent++) {
519: int nParent = m_order[other.getParentSet(
520: m_order[iAttribute]).getParent(iParent)];
521: if (!m_ParentSets[iAttribute].contains(nParent)
522: && !m_ParentSets[nParent]
523: .contains(iAttribute)) {
524: nExtra++;
525: }
526: }
527: }
528: return nExtra;
529: } catch (Exception e) {
530: System.err.println(e.getMessage());
531: return 0;
532: }
533: } // extraArcs
534:
535: /** calculates the divergence between the probability distribution
536: * represented by this network and that of another, that is,
537: * \sum_{x\in X} P(x)log P(x)/Q(x)
538: * where X is the set of values the nodes in the network can take,
539: * P(x) the probability of this network for configuration x
540: * Q(x) the probability of the other network for configuration x
541: * @param other network to compare with
542: * @return divergence between this and other Bayes Network
543: */
544: public double divergence(BayesNet other) {
545: try {
546: Sync(other);
547: // D: divergence
548: double D = 0.0;
549: int nNodes = m_Instances.numAttributes();
550: int[] nCard = new int[nNodes];
551: for (int iNode = 0; iNode < nNodes; iNode++) {
552: nCard[iNode] = m_Instances.attribute(iNode).numValues();
553: }
554: // x: holds current configuration of nodes
555: int[] x = new int[nNodes];
556: // simply sum over all configurations to calc divergence D
557: int i = 0;
558: while (i < nNodes) {
559: // update configuration
560: x[i]++;
561: while (i < nNodes
562: && x[i] == m_Instances.attribute(i).numValues()) {
563: x[i] = 0;
564: i++;
565: if (i < nNodes) {
566: x[i]++;
567: }
568: }
569: if (i < nNodes) {
570: i = 0;
571: // calc P(x) and Q(x)
572: double P = 1.0;
573: for (int iNode = 0; iNode < nNodes; iNode++) {
574: int iCPT = 0;
575: for (int iParent = 0; iParent < m_ParentSets[iNode]
576: .getNrOfParents(); iParent++) {
577: int nParent = m_ParentSets[iNode]
578: .getParent(iParent);
579: iCPT = iCPT * nCard[nParent] + x[nParent];
580: }
581: P = P
582: * m_Distributions[iNode][iCPT]
583: .getProbability(x[iNode]);
584: }
585:
586: double Q = 1.0;
587: for (int iNode = 0; iNode < nNodes; iNode++) {
588: int iCPT = 0;
589: for (int iParent = 0; iParent < other
590: .getParentSet(m_order[iNode])
591: .getNrOfParents(); iParent++) {
592: int nParent = m_order[other.getParentSet(
593: m_order[iNode]).getParent(iParent)];
594: iCPT = iCPT * nCard[nParent] + x[nParent];
595: }
596: Q = Q
597: * other.m_Distributions[m_order[iNode]][iCPT]
598: .getProbability(x[iNode]);
599: }
600:
601: // update divergence if probabilities are positive
602: if (P > 0.0 && Q > 0.0) {
603: D = D + P * Math.log(Q / P);
604: }
605: }
606: }
607: return D;
608: } catch (Exception e) {
609: System.err.println(e.getMessage());
610: return 0;
611: }
612: } // divergence
613:
614: /** Count nr of reversed arcs from other network compared to current network
615: * @param other network to compare with
616: * @return nr of missing arcs
617: */
618: public int reversedArcs(BayesNet other) {
619: try {
620: Sync(other);
621: int nReversed = 0;
622: for (int iAttribute = 0; iAttribute < m_Instances
623: .numAttributes(); iAttribute++) {
624: for (int iParent = 0; iParent < m_ParentSets[iAttribute]
625: .getNrOfParents(); iParent++) {
626: int nParent = m_ParentSets[iAttribute]
627: .getParent(iParent);
628: if (!other.getParentSet(m_order[iAttribute])
629: .contains(m_order[nParent])
630: && other.getParentSet(m_order[nParent])
631: .contains(m_order[iAttribute])) {
632: nReversed++;
633: }
634: }
635: }
636: return nReversed;
637: } catch (Exception e) {
638: System.err.println(e.getMessage());
639: return 0;
640: }
641: } // reversedArcs
642:
643: /**
644: * the default constructor
645: */
646: public BIFReader() {
647: }
648:
649: /**
650: * Loads the file specified as first parameter and prints it to stdout.
651: *
652: * @param args the command line parameters
653: */
654: public static void main(String[] args) {
655: try {
656: BIFReader br = new BIFReader();
657: br.processFile(args[0]);
658: System.out.println(br.toString());
659:
660: } catch (Throwable t) {
661: t.printStackTrace();
662: }
663: } // main
664: } // class BIFReader
|