001: /*
002: * File : $Source: /usr/local/cvs/opencms/src-modules/org/opencms/workplace/tools/content/CmsTagReplaceParser.java,v $
003: * Date : $Date: 2008-02-27 12:05:37 $
004: * Version: $Revision: 1.4 $
005: *
006: * This library is part of OpenCms -
007: * the Open Source Content Management System
008: *
009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
010: *
011: * This library is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU Lesser General Public
013: * License as published by the Free Software Foundation; either
014: * version 2.1 of the License, or (at your option) any later version.
015: *
016: * This library is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
019: * Lesser General Public License for more details.
020: *
021: * For further information about Alkacon Software GmbH, please see the
022: * company website: http://www.alkacon.com
023: *
024: * For further information about OpenCms, please see the
025: * project website: http://www.opencms.org
026: *
027: * You should have received a copy of the GNU Lesser General Public
028: * License along with this library; if not, write to the Free Software
029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
030: */
031:
032: package org.opencms.workplace.tools.content;
033:
034: import org.opencms.util.CmsHtmlParser;
035: import org.opencms.util.CmsHtmlTagRemoveFactory;
036: import org.opencms.util.CmsStringUtil;
037: import org.opencms.util.I_CmsHtmlNodeVisitor;
038:
039: import java.util.Iterator;
040:
041: import org.htmlparser.NodeFactory;
042: import org.htmlparser.Parser;
043: import org.htmlparser.Tag;
044: import org.htmlparser.lexer.Lexer;
045: import org.htmlparser.lexer.Page;
046: import org.htmlparser.util.ParserException;
047:
048: /**
049: *
050: * Html parser / visitor combination that visits a document and replaces Tag names by using the
051: * replacement configuration of a {@link org.opencms.workplace.tools.content.CmsTagReplaceSettings}
052: * instance.
053: * <p>
054: *
055: * Instances are reusable.
056: * <p>
057: *
058: * @author Achim Westermann
059: *
060: * @version $Revision: 1.4 $
061: *
062: * @since 6.1.7
063: *
064: */
065: public final class CmsTagReplaceParser extends CmsHtmlParser implements
066: I_CmsHtmlNodeVisitor {
067:
068: /** A tag factory that is able to make tags invisible to visitors. */
069: private final NodeFactory m_nodeFactory;
070:
071: /**
072: * Boolean flag that is set to true if during last call to {@link #process(String, String)}
073: * content was changed.
074: */
075: private boolean m_changedContent;
076: /**
077: * The settings to use for replacing tags.
078: */
079: private final CmsTagReplaceSettings m_settings;
080:
081: /**
082: * Default constructor that turns echo on and uses the settings for replacing tags.
083: * <p>
084: *
085: * @param settings the settings to use for tag replacement.
086: */
087: public CmsTagReplaceParser(CmsTagReplaceSettings settings) {
088:
089: // echo on
090: super (true);
091: m_settings = settings;
092: CmsHtmlTagRemoveFactory nodeFactory = new CmsHtmlTagRemoveFactory();
093: // add the removals of the settings to the tag factory:
094: Iterator itDeleteTags = m_settings.getDeleteTags().iterator();
095: while (itDeleteTags.hasNext()) {
096: nodeFactory.addTagRemoval((Tag) itDeleteTags.next());
097: }
098: m_nodeFactory = nodeFactory;
099:
100: }
101:
102: /**
103: * Overridden to also return the attributes of the Tag.
104: * <p>
105: *
106: * @see org.opencms.util.CmsHtmlParser#getTagHtml(org.htmlparser.Tag)
107: */
108: public String getTagHtml(Tag tag) {
109:
110: if (CmsStringUtil.isEmpty(tag.getTagName())) {
111: return "";
112: }
113: StringBuffer result = new StringBuffer(32);
114: result.append('<');
115: // Tag name is the first "Attribute"...
116: Iterator itAttributes = tag.getAttributesEx().iterator();
117: while (itAttributes.hasNext()) {
118: result.append(itAttributes.next().toString());
119: // avoid trailing whitespaces like <H1 >
120: // in 2nd run htmlparser 1.5 would turn the whitespace into an Attribute with null name
121: if (itAttributes.hasNext()) {
122: result.append(' ');
123: }
124: }
125: result.append('>');
126: return result.toString();
127: }
128:
129: /**
130: * Extracts the text from the given html content, assuming the given html encoding.
131: * <p>
132: * Additionally tags are replaced / removed according to the configuration of this instance.
133: * <p>
134: *
135: * <h3>Please note:</h3>
136: * There are static process methods in the superclass that will not do the replacements /
137: * removals. Don't mix them up with this method.
138: * <p>
139: *
140: * @param html the content to extract the plain text from.
141: *
142: * @param encoding the encoding to use.
143: *
144: * @return the text extracted from the given html content.
145: *
146: * @throws ParserException if something goes wrong.
147: */
148: public String process(String html, String encoding)
149: throws ParserException {
150:
151: // clear from potential previous run:
152: m_result = new StringBuffer();
153: m_changedContent = false;
154:
155: // initialize a parser with the given charset
156: Parser parser = new Parser();
157: parser.setNodeFactory(m_nodeFactory);
158: Lexer lexer = new Lexer();
159: Page page = new Page(html, encoding);
160: lexer.setPage(page);
161: parser.setLexer(lexer);
162: // process the page using the given visitor
163: parser.visitAllNodesWith(this );
164: // return the result
165: return getResult();
166: }
167:
168: /**
169: * @see org.opencms.util.CmsHtmlParser#visitEndTag(org.htmlparser.Tag)
170: */
171: public void visitEndTag(Tag tag) {
172:
173: boolean change = m_settings.replace(tag);
174: if (change) {
175: m_changedContent = true;
176: }
177: super .visitEndTag(tag);
178: }
179:
180: /**
181: * @see org.opencms.util.CmsHtmlParser#visitTag(org.htmlparser.Tag)
182: */
183: public void visitTag(Tag tag) {
184:
185: boolean change = m_settings.replace(tag);
186: if (change) {
187: m_changedContent = true;
188: }
189: super .visitTag(tag);
190: }
191:
192: /**
193: * Returns the changedContent.
194: * <p>
195: *
196: * @return the changedContent
197: */
198: public boolean isChangedContent() {
199:
200: return m_changedContent;
201: }
202:
203: }
|