01: /*
02: ItsNat Java Web Application Framework
03: Copyright (C) 2007 Innowhere Software Services S.L., Spanish Company
04: Author: Jose Maria Arranz Santamaria
05:
06: This program is free software: you can redistribute it and/or modify
07: it under the terms of the GNU Affero General Public License as published by
08: the Free Software Foundation, either version 3 of the License, or
09: (at your option) any later version. See the GNU Affero General Public
10: License for more details. See the copy of the GNU Affero General Public License
11: included in this program. If not, see <http://www.gnu.org/licenses/>.
12: */
13:
14: package org.itsnat.impl.core.dom.parse;
15:
16: import org.itsnat.core.ItsNatException;
17: import org.apache.xerces.parsers.DOMParser;
18:
19: import org.xml.sax.SAXNotRecognizedException;
20: import org.xml.sax.SAXNotSupportedException;
21:
22: /**
23: *
24: * @author jmarranz
25: */
26: public class NekoHTMLParser extends ItsNatDOMParser {
27:
28: /** Creates a new instance of NekoHTMLParser */
29: public NekoHTMLParser() {
30: try {
31: parser
32: .setProperty(
33: "http://cyberneko.org/html/properties/default-encoding",
34: "UTF-8"); // "ISO-8859-1"
35: parser.setProperty(
36: "http://cyberneko.org/html/properties/names/elems",
37: "match");
38: parser.setProperty(
39: "http://cyberneko.org/html/properties/names/attrs",
40: "no-change");
41:
42: parser
43: .setFeature(
44: "http://cyberneko.org/html/features/scanner/cdata-sections",
45: true); // Para evitar que se conviertan en comentarios
46:
47: //parser.setFeature("http://cyberneko.org/html/features/balance-tags",false);
48:
49: // Podríamos quitar los namespaces para usar XHTML porque Xerces no los soporta en HTML
50: // segun se dice en:
51: // http://people.apache.org/~andyc/neko/doc/html/faq.html
52: // "Why do I get a hierarchy request error using DOM?
53: // ...The Xerces HTML DOM implementation does not support namespaces and cannot represent XHTML documents with namespace information.
54: // Therefore, in order to use the default HTML DOM implementation with NekoHTML's DOMParser to parse XHTML documents, you must turn off namespace processing.
55: // parser.setFeature("http://xml.org/sax/features/namespaces", false); "
56: // De otra manera en XHTML no crearía HTMLElements para los nodos sino Element XML
57: // El problema es que deja de funcionar el namespace "itsnat", por lo que
58: // la mejor solución es hacer una clase nueva documento HTML que redefina el método
59: // createElementNS
60:
61: parser
62: .setProperty(
63: "http://apache.org/xml/properties/dom/document-class-name",
64: HTMLDocumentImplXercesPatch.class.getName());
65: } catch (SAXNotRecognizedException ex) {
66: throw new ItsNatException(ex);
67: } catch (SAXNotSupportedException ex) {
68: throw new ItsNatException(ex);
69: }
70: }
71:
72: public DOMParser createParser() {
73: return new org.cyberneko.html.parsers.DOMParser();
74: }
75: }
|