001: /*
002: * File : $Source: /usr/local/cvs/opencms/src-modules/org/opencms/workplace/tools/database/CmsHtmlImportConverter.java,v $
003: * Date : $Date: 2008-02-27 12:05:51 $
004: * Version: $Revision: 1.13 $
005: *
006: * This library is part of OpenCms -
007: * the Open Source Content Management System
008: *
009: * Copyright (c) 2002 - 2008 Alkacon Software GmbH (http://www.alkacon.com)
010: *
011: * This library is free software; you can redistribute it and/or
012: * modify it under the terms of the GNU Lesser General Public
013: * License as published by the Free Software Foundation; either
014: * version 2.1 of the License, or (at your option) any later version.
015: *
016: * This library is distributed in the hope that it will be useful,
017: * but WITHOUT ANY WARRANTY; without even the implied warranty of
018: * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
019: * Lesser General Public License for more details.
020: *
021: * For further information about Alkacon Software GmbH, please see the
022: * company website: http://www.alkacon.com
023: *
024: * For further information about OpenCms, please see the
025: * project website: http://www.opencms.org
026: *
027: * You should have received a copy of the GNU Lesser General Public
028: * License along with this library; if not, write to the Free Software
029: * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
030: */
031:
032: package org.opencms.workplace.tools.database;
033:
034: import org.opencms.file.CmsPropertyDefinition;
035: import org.opencms.i18n.CmsEncoder;
036: import org.opencms.main.CmsLog;
037: import org.opencms.util.CmsStringUtil;
038:
039: import java.io.ByteArrayInputStream;
040: import java.io.ByteArrayOutputStream;
041: import java.io.IOException;
042: import java.io.InputStream;
043: import java.io.PrintWriter;
044: import java.io.Reader;
045: import java.io.StringReader;
046: import java.io.StringWriter;
047: import java.io.UnsupportedEncodingException;
048: import java.io.Writer;
049: import java.util.HashSet;
050: import java.util.Hashtable;
051: import java.util.StringTokenizer;
052: import java.util.regex.Matcher;
053: import java.util.regex.Pattern;
054:
055: import org.w3c.dom.Document;
056: import org.w3c.dom.NamedNodeMap;
057: import org.w3c.dom.Node;
058: import org.w3c.dom.NodeList;
059: import org.w3c.tidy.Tidy;
060:
061: /**
062: * This class implements Html-converting routines based on tidy to modify the
063: * Html code of the imported Html pages.<p>
064: *
065: * @author Michael Emmerich
066: *
067: * @version $Revision: 1.13 $
068: *
069: * @since 6.0.0
070: */
071: public class CmsHtmlImportConverter {
072:
073: /** defintition of the alt attribute. */
074: private static final String ATTRIB_ALT = "alt";
075:
076: /** defintition of the content attribute. */
077: private static final String ATTRIB_CONTENT = "content";
078:
079: /** defintition of the href attribute. */
080: private static final String ATTRIB_HREF = "href";
081:
082: /** defintition of the name attribute. */
083: private static final String ATTRIB_NAME = "name";
084:
085: /** defintition of the src attribute. */
086: private static final String ATTRIB_SRC = "src";
087:
088: /** defintition of the <BODY></BODY> node. */
089: private static final String NODE_BODY = "body";
090:
091: /** defintition of the <HEAD></HEAD> node. */
092: private static final String NODE_HEAD = "head";
093:
094: /** defintition of the <A></A> node. */
095: private static final String NODE_HREF = "a";
096:
097: /** defintition of the <HTML></HTML> node. */
098: private static final String NODE_HTML = "html";
099:
100: /** defintition of the <IMG></IMG> node. */
101: private static final String NODE_IMG = "img";
102:
103: /** defintition of the <META></META> node. */
104: private static final String NODE_META = "meta";
105:
106: /** defintition of the <TITLE></TITLE> node. */
107: private static final String NODE_TITLE = "title";
108:
109: /**
110: * HashMap stores tag names, after the end-tag, a "\n" is added to the output.<p>
111: */
112: private HashSet m_enterTags = new HashSet();
113:
114: /**
115: * the absolute path in the real filesystem of the file to convert.
116: */
117: private String m_filename;
118:
119: /**
120: * reference to the HtmlImport object, required to access the link translation.
121: */
122: private CmsHtmlImport m_htmlImport;
123:
124: /**
125: * temporary buffer used in transformation method.
126: */
127: private StringBuffer m_tempString;
128:
129: /** instance of JTidy. */
130: private Tidy m_tidy = new Tidy();
131:
132: /** flag to write the output. */
133: private boolean m_write;
134:
135: /**
136: * Default constructor, creates a new HtmlConverter.<p>
137: *
138: * @param htmlImport reference to the htmlimport
139: * @param xmlMode switch for setting the import to HTML or XML mode
140: */
141: public CmsHtmlImportConverter(CmsHtmlImport htmlImport,
142: boolean xmlMode) {
143:
144: m_tidy.setTidyMark(false);
145: m_tidy.setShowWarnings(false);
146: m_tidy.setQuiet(true);
147: m_tidy.setForceOutput(true);
148:
149: if (xmlMode) {
150: m_tidy.setXmlTags(xmlMode);
151: m_tidy.setXmlSpace(true);
152: }
153:
154: initialiseTags();
155: m_htmlImport = htmlImport;
156: }
157:
158: /**
159: * Extracts the content of a HTML page.<p>
160: *
161: * This method should be pretty robust and work even if the input HTML does not contains
162: * the specified matchers.<p>
163: *
164: * @param content the content to extract the body from
165: * @param startpoint the point where matching starts
166: * @param endpoint the point where matching ends
167: * @return the extracted body tag content
168: */
169: public static String extractHtml(String content, String startpoint,
170: String endpoint) {
171:
172: /** Regex that matches a start body tag. */
173: Pattern startPattern = Pattern.compile(startpoint,
174: Pattern.CASE_INSENSITIVE);
175:
176: /** Regex that matches an end body tag. */
177: Pattern endPattern = Pattern.compile(endpoint,
178: Pattern.CASE_INSENSITIVE);
179:
180: Matcher startMatcher = startPattern.matcher(content);
181: Matcher endMatcher = endPattern.matcher(content);
182:
183: int start = 0;
184: int end = content.length();
185:
186: if (startMatcher.find()) {
187: start = startMatcher.end();
188: }
189:
190: if (endMatcher.find(start)) {
191: end = endMatcher.start();
192: }
193:
194: return content.substring(start, end);
195: }
196:
197: /**
198: * Transforms HTML code into user defined output.<p>
199: *
200: * @param input Reader with HTML code
201: * @param output Writer with transformed code
202: * @param startPattern the start pattern definition for content extracting
203: * @param endPattern the end pattern definition for content extracting
204: * @param properties the file properties
205: */
206: public void convertHTML(Reader input, Writer output,
207: String startPattern, String endPattern, Hashtable properties) {
208:
209: /* local variables */
210: StringBuffer htmlString = new StringBuffer();
211: Node node;
212: String outString = "";
213:
214: try {
215: /* write InputStream input in StringBuffer htmlString */
216: int c;
217: while ((c = input.read()) != -1) {
218: htmlString.append((char) c);
219: }
220: } catch (IOException e) {
221: if (CmsLog.INIT.isWarnEnabled()) {
222: CmsLog.INIT.warn(Messages.get().getBundle().key(
223: Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0,
224: e.getLocalizedMessage()));
225: }
226: return;
227: }
228: outString = htmlString.toString();
229: // extract from html if even both patterns are defined
230: if (CmsStringUtil.isNotEmpty(startPattern)
231: && CmsStringUtil.isNotEmpty(endPattern)) {
232: String extractMain = extractHtml(outString, startPattern,
233: endPattern);
234: if (extractMain.length() != outString.length()) {
235: String extractHead = extractHtml(outString, "<html>",
236: CmsStringUtil.BODY_START_REGEX);
237: //String extractHead = extractHtml(extractMain, "<html>", CmsStringUtil.C_BODY_START_REGEX);
238: StringBuffer buffer = new StringBuffer(extractHead
239: .length()
240: + extractMain.length() + 255);
241: buffer.append("<html>");
242: buffer.append(extractHead);
243: buffer.append("<body>");
244: buffer.append(extractMain);
245: buffer.append("</body></html>");
246: outString = buffer.toString();
247: }
248: }
249:
250: /* convert htmlString in InputStream for parseDOM */
251: InputStream in;
252: try {
253: in = new ByteArrayInputStream(outString
254: .getBytes(CmsEncoder.ENCODING_UTF_8));
255: } catch (UnsupportedEncodingException e) {
256: // this should never happen since UTF-8 is always supported
257: in = new ByteArrayInputStream(outString.getBytes());
258: }
259: m_tidy.setInputEncoding(CmsEncoder.ENCODING_UTF_8);
260: m_tidy.setOutputEncoding(CmsEncoder.ENCODING_UTF_8);
261:
262: // hold tidy error information into a new PrintWriter Object
263: PrintWriter errorLog = new PrintWriter(
264: new ByteArrayOutputStream(), true);
265: m_tidy.setErrout(errorLog);
266:
267: node = m_tidy.parseDOM(in, null);
268: /* check if html code has errors */
269: if (m_tidy.getParseErrors() != 0) {
270: if (CmsLog.INIT.isWarnEnabled()) {
271: CmsLog.INIT.warn(Messages.get().getBundle().key(
272: Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_0));
273: }
274: }
275: /* second step: create transformed output with printDocument from DOM */
276: this .printDocument(node, properties);
277:
278: try {
279: String content = m_tempString.toString();
280: content = CmsStringUtil.substitute(content, "<br></br>",
281: "<br>");
282: content = CmsStringUtil.substitutePerl(content,
283: "</a>(\\w+)", "</a> $1", "g");
284: output.write(content);
285: output.close();
286:
287: } catch (IOException e) {
288: if (CmsLog.INIT.isWarnEnabled()) {
289: CmsLog.INIT.warn(Messages.get().getBundle().key(
290: Messages.LOG_HTMLIMPORT_CONVERSION_ERROR_1,
291: e.getLocalizedMessage()));
292: }
293: return;
294: }
295: }
296:
297: /**
298: * Transforms HTML code into user defined output.<p>
299: *
300: * @param filename the absolute path in the real filesystem of the file to convert
301: * @param inString String with HTML code
302: * @param startPattern the start pattern definition for content extracting
303: * @param endPattern the end pattern definition for content extracting
304: * @param properties the file properties
305: * @return String with transformed code
306: */
307: public String convertHTML(String filename, String inString,
308: String startPattern, String endPattern, Hashtable properties) {
309:
310: m_tempString = new StringBuffer();
311: m_write = true;
312: m_filename = filename.replace('\\', '/');
313: Reader in = new StringReader(inString);
314: Writer out = new StringWriter();
315: convertHTML(in, out, startPattern, endPattern, properties);
316: return out.toString();
317: }
318:
319: /**
320: * Initialises Vector m_enterTags with tag names.<p>
321: */
322: private void initialiseTags() {
323:
324: StringTokenizer T = new StringTokenizer(
325: "p,table,tr,td,body,head,script,pre,title,style,h1,h2,h3,h4,h5,h6,ul,ol,li",
326: ",");
327: while (T.hasMoreTokens()) {
328: m_enterTags.add(T.nextToken());
329: }
330: }
331:
332: /**
333: * Private method to parse DOM and create user defined output.<p>
334: *
335: * @param node Node of DOM from HTML code
336: * @param properties the file properties
337: */
338: private void printDocument(Node node, Hashtable properties) {
339:
340: // if node is empty do nothing... (Recursion)
341: if (node == null) {
342: return;
343: }
344: // initialise local variables
345: int type = node.getNodeType();
346: String name = node.getNodeName();
347:
348: // detect node type
349: switch (type) {
350: case Node.DOCUMENT_NODE:
351:
352: this .printDocument(((Document) node).getDocumentElement(),
353: properties);
354: break;
355: case Node.ELEMENT_NODE:
356:
357: // check if its the <head> node. Nothing inside the <head> node
358: // must be
359: // part of the output, but we must scan the content of this
360: // node to get all
361: // <meta> tags
362: if (name.equals(NODE_HEAD)) {
363: m_write = false;
364: }
365: // scan element node; if a block has to be removed or replaced,
366: // break and discard child nodes
367: transformStartElement(node, properties);
368:
369: // test if node has children
370: NodeList children = node.getChildNodes();
371: if (children != null) {
372: int len = children.getLength();
373: for (int i = 0; i < len; i++) {
374: // recursively call printDocument with all child nodes
375: this .printDocument(children.item(i), properties);
376: }
377: }
378: break;
379: case Node.TEXT_NODE:
380:
381: // replace subStrings in text nodes
382: transformTextNode(node);
383: break;
384: default:
385:
386: break;
387: }
388: // end of recursion, add eventual endtags and suffixes
389: switch (type) {
390: case Node.ELEMENT_NODE:
391: // analyse endtags and add them to output
392: transformEndElement(node);
393: if (node.getNodeName().equals(NODE_HEAD)) {
394: m_write = true;
395: }
396: break;
397: case Node.DOCUMENT_NODE:
398: break;
399: default:
400: break;
401: }
402: }
403:
404: /**
405: * Transform element nodes and create end tags in output.<p>
406: *
407: * @param node actual element node
408: */
409: private void transformEndElement(Node node) {
410:
411: // check hat kind of node we have
412: String nodeName = node.getNodeName();
413:
414: // the <HTML> and <BODY> node must be skipped
415: if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) {
416: // do nothing here
417: } else {
418: // only do some output if we are in writing mode
419: if (m_write) {
420: m_tempString.append("</");
421: m_tempString.append(nodeName);
422: m_tempString.append(">");
423:
424: // append a "\n" to output String if possible
425: if (m_enterTags.contains(node.getNodeName())) {
426: m_tempString.append("\n");
427: }
428: }
429: }
430: }
431:
432: /**
433: * Transforms element nodes and create start tags in output. <p>
434: *
435: * @param node actual element node
436: * @param properties the file properties
437: */
438: private void transformStartElement(Node node, Hashtable properties) {
439:
440: // check hat kind of node we have
441: String nodeName = node.getNodeName();
442:
443: // the <HTML> and <BODY> node must be skipped
444: if (nodeName.equals(NODE_HTML) || nodeName.equals(NODE_BODY)) {
445: // the <TITLE> node must be read and its value set as properties to
446: // the imported file
447:
448: } else if (nodeName.equals(NODE_TITLE)) {
449:
450: writeTitleProperty(node, properties);
451:
452: } else if (nodeName.equals(NODE_META)) {
453:
454: writeMetaTagProperty(node, properties);
455:
456: } else if (nodeName.equals(NODE_HREF)) {
457:
458: // only do some output if we are in writing mode
459: if (m_write) {
460: m_tempString.append("<");
461: m_tempString.append(nodeName);
462: NamedNodeMap attrs = node.getAttributes();
463: // look through all attribs to find the reference
464: for (int i = attrs.getLength() - 1; i >= 0; i--) {
465: String name = attrs.item(i).getNodeName();
466: String value = attrs.item(i).getNodeValue();
467:
468: if (name.equals(ATTRIB_HREF)) {
469:
470: // check if this is an external link
471: if (value.indexOf("://") > 0) {
472: // store it for later creation of an entry in the
473: // link gallery
474: String externalLinkFile = m_htmlImport
475: .storeExternalLink(value);
476: if (externalLinkFile != null) {
477: value = m_htmlImport.getLinkGallery()
478: + externalLinkFile;
479: }
480: } else if (!value.startsWith("mailto:")
481: && !value.startsWith("javascript:")) {
482:
483: // save an existing anchor link for later use
484: // if (value.indexOf("#") > 0) {
485: // String anchor = value.substring(value.indexOf("#"), value.length());
486: // }
487: // get the new link into the VFS
488: String internalUri = m_htmlImport
489: .getAbsoluteUri(
490: value,
491: m_filename
492: .substring(
493: 0,
494: m_filename
495: .lastIndexOf("/") + 1));
496:
497: value = m_htmlImport
498: .translateLink(internalUri);
499: }
500: }
501:
502: m_tempString.append(" ");
503: m_tempString.append(name);
504: m_tempString.append("=\"");
505: m_tempString.append(value);
506: m_tempString.append("\"");
507: }
508: m_tempString.append(">");
509: }
510:
511: // this is a imasge, its reference must be converted
512: } else if (nodeName.equals(NODE_IMG)) {
513:
514: // only do some output if we are in writing mode
515: if (m_write) {
516: m_tempString.append("<");
517: m_tempString.append(nodeName);
518: NamedNodeMap attrs = node.getAttributes();
519: // look through all attribs to find the src and alt attributes
520: String imagename = "";
521: String altText = "";
522: for (int i = attrs.getLength() - 1; i >= 0; i--) {
523: String name = attrs.item(i).getNodeName();
524: String value = attrs.item(i).getNodeValue();
525: if (name.equals(ATTRIB_SRC)) {
526: // we found the src. now check if it refers to an
527: // external image.
528: // if not, we must get the correct location in the VFS
529: if (value.indexOf("://") <= 0) {
530: imagename = m_htmlImport.getAbsoluteUri(
531: value,
532: m_filename.substring(0, m_filename
533: .lastIndexOf("/") + 1));
534: value = m_htmlImport
535: .translateLink(imagename);
536: }
537: } else if (name.equals(ATTRIB_ALT)) {
538: altText = value;
539: }
540:
541: m_tempString.append(" ");
542: m_tempString.append(name);
543: m_tempString.append("=\"");
544: m_tempString.append(value);
545: m_tempString.append("\"");
546: }
547:
548: //store the alt tag of this image for later use
549: m_htmlImport.storeImageInfo(imagename, altText);
550:
551: m_tempString.append(">");
552: }
553: } else {
554:
555: // only do some output if we are in writing mode
556: if (m_write) {
557:
558: m_tempString.append("<");
559: m_tempString.append(nodeName);
560: NamedNodeMap attrs = node.getAttributes();
561: for (int i = attrs.getLength() - 1; i >= 0; i--) {
562: m_tempString.append(" "
563: + attrs.item(i).getNodeName() + "=" + "\"");
564: /* scan attribute values and replace subStrings */
565: m_tempString.append(attrs.item(i).getNodeValue()
566: + "\"");
567: }
568: m_tempString.append(">");
569: }
570: }
571: }
572:
573: /**
574: * Private method to transform text nodes.<p>
575: *
576: * @param node actual text node
577: */
578: private void transformTextNode(Node node) {
579:
580: // only do some output if we are in writing mode
581: if (m_write) {
582: String helpString = node.getNodeValue();
583: m_tempString.append(helpString);
584: }
585: }
586:
587: /**
588: * Writes meta tags as cms properties by analyzing the meta tags nodes.<p>
589: *
590: * @param node the meta tag node in html document
591: * @param properties the properties hashtable
592: */
593: private void writeMetaTagProperty(Node node, Hashtable properties) {
594:
595: NamedNodeMap attrs = node.getAttributes();
596: String metaName = "";
597: String metaContent = "";
598: // look through all attribs to find the name and content attributes
599: for (int i = attrs.getLength() - 1; i >= 0; i--) {
600: String name = attrs.item(i).getNodeName();
601: String value = attrs.item(i).getNodeValue();
602: if (name.equals(ATTRIB_NAME)) {
603: metaName = value;
604: } else if (name.equals(ATTRIB_CONTENT)) {
605: metaContent = value;
606: }
607: }
608: // check if we have valid entries for this <META> node, store them
609: // in the properties
610: if (metaName.length() > 0 && metaContent.length() > 0) {
611: properties.put(metaName, CmsStringUtil.substitute(
612: metaContent, "{subst}", "&#"));
613: }
614: }
615:
616: /**
617: * Sets the Property title by analyzing the title node.<p>
618: *
619: * @param node the title node in html document
620: * @param properties the properties hashtable
621: */
622: private void writeTitleProperty(Node node, Hashtable properties) {
623:
624: String title = "";
625: // the title string is stored in the first child node
626: NodeList children = node.getChildNodes();
627: if (children != null) {
628: Node titleNode = children.item(0);
629: if (titleNode != null) {
630: title = titleNode.getNodeValue();
631: }
632: }
633: // add the title property if we have one
634: if ((title != null) && (title.length() > 0)) {
635:
636: properties.put(CmsPropertyDefinition.PROPERTY_TITLE,
637: CmsStringUtil.substitute(title, "{subst}", "&#"));
638: // the title will be used as navtext if no other navtext is
639: // given
640: if (properties.get(CmsPropertyDefinition.PROPERTY_NAVTEXT) == null) {
641: properties.put(CmsPropertyDefinition.PROPERTY_NAVTEXT,
642: CmsStringUtil
643: .substitute(title, "{subst}", "&#"));
644: }
645: }
646:
647: }
648:
649: }
|