tidy.py :  » Web-Server » CherryPy » CherryPy-3.1.2 » cherrypy » lib » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Web Server » CherryPy 
CherryPy » CherryPy 3.1.2 » cherrypy » lib » tidy.py
"""Functions to run cherrypy.response through Tidy or NSGML."""

import cgi
import os
import StringIO
import traceback

import cherrypy
    
def _write_file(path, data):
    """Write data to path in binary mode, always closing the file."""
    f = open(path, 'wb')
    try:
        f.write(data)
    finally:
        f.close()


def _read_file(path):
    """Return the binary content of path, always closing the file."""
    f = open(path, 'rb')
    try:
        return f.read()
    finally:
        f.close()


def _drop_content_length(response):
    """Delete the Content-Length header so finalize() recalcs it."""
    if "Content-Length" in response.headers:
        del response.headers["Content-Length"]


def tidy(temp_dir, tidy_path, strict_xml=False, errors_to_ignore=None,
         indent=False, wrap=False, warnings=True):
    """Run cherrypy.response through Tidy.
    
    If either 'indent' or 'wrap' are specified, then response.body will be
    set to the output of tidy (provided tidy exited with status 0).
    Otherwise, only errors (including warnings, if warnings is True) will
    change the body. Only 'text/html' responses are processed.
    
    temp_dir: directory in which 'page.html', 'tidy.out' and 'tidy.err'
        are written. The names are fixed, so every call reuses them.
    tidy_path: path of the tidy executable.
    strict_xml: if true, '-xml' is passed to tidy and, when tidy reports
        nothing, the body is additionally parsed with ElementTree.
    errors_to_ignore: optional list of substrings; a tidy message that
        contains any of them is dropped.
    indent: if true, '-indent' is passed to tidy.
    wrap: False for no wrap option, otherwise a column number which is
        passed to tidy as '-wrap N'. A bare True carries no column and
        adds no option.
    warnings: if true, tidy 'Warning' lines are reported as well as
        'Error' lines.
    
    Note that we use the standalone Tidy tool rather than the python
    mxTidy module. This is because this module does not seem to be
    stable and it crashes on some HTML pages (which means that the
    server would also crash)
    """
    response = cherrypy.response
    
    # the tidy tool, by its very nature it's not generator friendly, 
    # so we just collapse the body and work with it.
    orig_body = response.collapse_body()
    
    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0]
    encoding = ''
    i = fct.find('charset=')
    if i != -1:
        # Stop at the next ';' so that any further Content-Type parameter
        # does not leak into the tidy command line or the XML prolog.
        encoding = fct[i + 8:].split(';')[0].strip()
    
    if ct != 'text/html':
        return
    
    page_file = os.path.join(temp_dir, 'page.html')
    _write_file(page_file, orig_body)
    
    out_file = os.path.join(temp_dir, 'tidy.out')
    err_file = os.path.join(temp_dir, 'tidy.err')
    tidy_enc = encoding.replace('-', '')
    if tidy_enc:
        tidy_enc = '-' + tidy_enc
    
    # Command-line fragments are kept apart from the boolean arguments so
    # that 'strict_xml' is still a plain flag further down.
    xml_opt = ("", " -xml")[bool(strict_xml)]
    
    if indent:
        indent_opt = ' -indent'
    else:
        indent_opt = ''
    
    if isinstance(wrap, bool):
        # int(True) would ask tidy to wrap at column 1.
        wrap_opt = ''
    else:
        try:
            wrap_opt = ' -wrap %d' % int(wrap)
        except (TypeError, ValueError):
            wrap_opt = ''
    
    # NOTE(review): the command is assembled as a shell string and only
    # tidy_path is quoted; a temp_dir containing spaces will break it.
    result = os.system('"%s" %s%s%s%s -f %s -o %s %s' %
                       (tidy_path, tidy_enc, xml_opt, indent_opt, wrap_opt,
                        err_file, out_file, page_file))
    use_output = bool(indent_opt or wrap_opt) and not result
    if use_output:
        output = _read_file(out_file)
    
    new_errs = []
    for err in _read_file(err_file).splitlines():
        if not ('Error' in err or (warnings and 'Warning' in err)):
            continue
        for err_ign in errors_to_ignore or []:
            if err_ign in err:
                break
        else:
            # No ignore pattern matched this message.
            new_errs.append(err)
    
    if new_errs:
        # wrong_content() HTML-escapes its header and turns newlines into
        # <br />, so the messages must be joined with newlines, not markup.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        _drop_content_length(response)
        return
    
    if strict_xml:
        # The HTML is OK, but is it valid XML?
        # Prefer the standard-library ElementTree, falling back to the
        # standalone elementtree package on older interpreters.
        try:
            from xml.etree.ElementTree import parse
        except ImportError:
            from elementtree.ElementTree import parse
        
        # Swap the named entities for plain text before parsing.
        for tag in ['nbsp', 'quot']:
            orig_body = orig_body.replace('&' + tag + ';', tag.upper())
        
        if encoding:
            enctag = '<?xml version="1.0" encoding="%s"?>' % encoding
            orig_body = enctag + orig_body
        
        try:
            parse(StringIO.StringIO(orig_body))
        except Exception:
            # Wrong XML: report the traceback text; wrong_content() takes
            # care of escaping it and converting its newlines.
            tb_file = StringIO.StringIO()
            traceback.print_exc(file=tb_file)
            response.body = wrong_content(tb_file.getvalue(), orig_body,
                                          "XML")
            _drop_content_length(response)
            return
    
    if use_output:
        response.body = [output]
        _drop_content_length(response)

def html_space(text):
    """Escape text, replacing space with nbsp and tab with 4 nbsp's.
    
    '&', '<' and '>' are escaped; quote characters are left untouched.
    """
    # cgi.escape was removed in Python 3.8; html.escape with quote=False
    # performs the same replacements, so prefer it when it is available.
    try:
        from html import escape
    except ImportError:
        from cgi import escape
    escaped = escape(text, False)
    # Tabs become four spaces first so they end up as four nbsp's.
    return escaped.replace('\t', '    ').replace(' ', '&nbsp;')

def html_break(text):
    """Escape text, replacing newline with HTML br element.
    
    '&', '<' and '>' are escaped; quote characters are left untouched.
    """
    # cgi.escape was removed in Python 3.8; html.escape with quote=False
    # performs the same replacements, so prefer it when it is available.
    try:
        from html import escape
    except ImportError:
        from cgi import escape
    return escape(text, False).replace('\n', '<br />')

def wrong_content(header, body, content_type="HTML"):
    """Build the HTML report shown in place of an invalid response body.
    
    The report starts with a 'Wrong <content_type>:' banner followed by
    'header' (escaped, with newlines turned into br elements). Every line
    of 'body' follows, escaped and prefixed with its 1-based line number
    zero-padded to three digits. All parts are joined with br elements.
    """
    banner = "Wrong %s:<br />%s<br />" % (content_type, html_break(header))
    numbered = ["%03d - %s" % (index + 1, html_space(text))
                for index, text in enumerate(body.splitlines())]
    return "<br />".join([banner] + numbered)


def nsgmls(temp_dir, nsgmls_path, catalog_path, errors_to_ignore=None):
    """Run cherrypy.response through nsgmls.
    
    Only 'text/html' responses are processed. The body, with its script
    elements removed, is written to 'page.html' in temp_dir and checked
    by the nsgmls executable. If any reported message remains after
    filtering, response.body is replaced by a report listing the messages
    and the numbered lines of the checked document.
    
    temp_dir: directory in which 'page.html' and 'nsgmls.err' are written.
        The names are fixed, so every call reuses them.
    nsgmls_path: path of the nsgmls executable.
    catalog_path: catalog path handed to nsgmls through its -c option.
    errors_to_ignore: optional list of substrings; an nsgmls message that
        contains any of them is dropped.
    """
    response = cherrypy.response
    
    # The checker reads the page from a file, so it is not generator
    # friendly: collapse the body and work with it.
    orig_body = response.collapse_body()
    
    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0]
    if ct != 'text/html':
        return
    
    # Remove bits of Javascript (nsgmls doesn't seem to handle
    #   them correctly (for instance, if <a appears in your
    #   Javascript code nsgmls complains about it)
    while True:
        start = orig_body.find('<script')
        if start == -1:
            break
        end = orig_body.find('</script>', start)
        if end == -1:
            # Unterminated script element: leave the rest as it is.
            break
        orig_body = orig_body[:start] + orig_body[end + len('</script>'):]
    
    page_file = os.path.join(temp_dir, 'page.html')
    f = open(page_file, 'wb')
    try:
        f.write(orig_body)
    finally:
        f.close()
    
    err_file = os.path.join(temp_dir, 'nsgmls.err')
    # NOTE(review): the command is assembled as a shell string with no
    # quoting; paths containing spaces will break it.
    command = ('%s -c%s -f%s -s -E10 %s' %
               (nsgmls_path, catalog_path, err_file, page_file))
    command = command.replace('\\', '/')
    os.system(command)
    f = open(err_file, 'rb')
    try:
        errs = f.read()
    finally:
        f.close()
    
    new_errs = []
    for err in errs.splitlines():
        for err_ign in errors_to_ignore or []:
            if err_ign in err:
                break
        else:
            # No ignore pattern matched this message.
            new_errs.append(err)
    
    if new_errs:
        # wrong_content() HTML-escapes its header and turns newlines into
        # <br />, so the messages must be joined with newlines, not markup.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        if "Content-Length" in response.headers:
            # Delete Content-Length header so finalize() recalcs it.
            del response.headers["Content-Length"]

www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.