SnapReader.py :  » Development » SnapLogic » snaplogic » common » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Development » SnapLogic 
SnapLogic » snaplogic » common » SnapReader.py
# $SnapHashLicense:
# 
# SnapLogic - Open source data services
# 
# Copyright (C) 2009, SnapLogic, Inc.  All rights reserved.
# 
# See http://www.snaplogic.org for more information about
# the SnapLogic project. 
# 
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the LEGAL file
# at the top of the source tree.
# 
# "SnapLogic" is a trademark of SnapLogic, Inc.
# 
# 
# $

# $Id: SnapReader.py 7886 2009-06-15 23:17:56Z dmitri $

"""
This module provides a uniform interface for reading data from various input sources.
The supported input sources are file, http, https, and ftp.

"""

__docformat__ = "epytext en"


import re, os
import codecs

import urllib2, urlparse
from urllib2 import build_opener,HTTPBasicAuthHandler,HTTPPasswordMgrWithDefaultRealm

from snaplogic.common.snap_exceptions import *


# Registry of URL schemes and the reader class that serves each one.  Every
# value is the dotted path 'package.module.ClassName' of a reader class.
# User-provided readers are registered by inserting or updating entries here.
SnapSchemeReader = dict(
    http='snaplogic.common.SnapReader.SnapHttpReader',
    https='snaplogic.common.SnapReader.SnapHttpReader',
    ftp='snaplogic.common.SnapReader.SnapFtpReader',
    file='snaplogic.common.SnapReader.SnapFileReader'
)
"""Dictionary of supported schemes/protocols and the corresponding Reader objects."""


class SnapReader(object):

    """
    Base class for readers that access an input source.  The supported input
    sources are file, http, https, and ftp.  The class interfaces allow the
    users to create a reader object, open and close a connection to the input
    source, and read data from the connection.

    This class is abstract: open() without a callback and close() always raise
    SnapObjTypeError.  Use create() to obtain a concrete reader.

    """

    def __init__(self, input, username, password, encoding=None, proxy=None):
        """
        Initialize internal variables.

        @param input: The input source, in the format 'scheme://input_path'.
            If the '://' separator is missing, the scheme is set to the empty
            string and the whole input is used as the path.
        @type input: str

        @param username: Credentials: username to read the input source
        @type username: str, None or empty string, if no credential needed.

        @param password: Credentials: password to read the input source
        @type password: str, None or empty string, if no credential needed.

        @param encoding: The encoding to expect from the source.
        @type encoding:  str

        @param proxy: Proxy server information that may be needed to read
            resources at external URIs
        @type proxy: str
        """

        # Save information parsed from the URL.
        self._input = input

        # partition() reports a missing separator explicitly.  Slicing around
        # find('://'), which returns -1 in that case, silently drops the last
        # character of the scheme and the first two of the path.
        scheme, sep, path = input.partition('://')
        if not sep:
            scheme, path = '', input
        self._scheme = scheme
        self._path = path
        self._encoding = encoding

        # Credentials are normalized so that "no credential" is always "".
        if username is None:
            username = ""
        if password is None:
            password = ""
        self._username = username
        self._password = password

        self._proxy = proxy

        # Handle of the open connection; None while closed.
        self._resp = None

        if encoding is not None:
            # Switch the read methods to their decoding variants.  The
            # _*_encoding methods are supplied by the concrete subclasses;
            # this base class does not define them, so a subclass that omits
            # them fails here with AttributeError when an encoding is given.
            self.read = self._read_encoding
            self.readline = self._readline_encoding
            self.readlines = self._readlines_encoding

    # Read-only properties
    input = property(lambda self: self._input)
    path = property(lambda self: self._path)
    scheme = property(lambda self: self._scheme)
    handle = property(lambda self: self._resp)
    encoding = property(lambda self: self._encoding)


    @staticmethod
    def create(input, username=None, password=None, encoding=None, proxy=None):
        """
        Create and obtain a reader object for the given input source as specified in the input parameter.

        The reader class is looked up in SnapSchemeReader by the URL scheme.
        The module named in the registry entry is imported if it is not loaded
        yet, so user-registered readers do not have to be imported beforehand.

        @param input: The input source URL in the format of 'scheme://input_path'.
            For example: http://site/input, ftp://site/input, file://input.
        @type input: str

        @param username: Credentials: username to read the input source
        @type username: str, None or empty string, if no credential needed.

        @param password: Credentials: password to read the input source
        @type password: str, None or empty string, if no credential needed.

        @param encoding: The encoding to expect from the source.
        @type encoding:  str

        @param proxy: Proxy server information that may be needed to read
            resources at external URIs
        @type proxy: str

        @return: A reader object.
        @rtype: snaplogic.common.SnapReader

        @except SnapValueError: on unsupported scheme (including an input
            without '://'), invalid reader class.
        @except SnapObjTypeError: on invalid type of reader class

        """
        # Imported here because the module's import block does not import sys
        # explicitly.
        import sys

        # An input without '://' has no scheme.  Treating it as such prevents
        # e.g. 'httpx' from being mistaken for the 'http' scheme.
        scheme, sep, _rest = input.partition('://')
        if not sep:
            scheme = ''
        if scheme not in SnapSchemeReader:
            raise SnapValueError('Unsupported scheme', scheme, input)

        # Split the registered dotted path into module name and class name.
        rdrpath = SnapSchemeReader[scheme]
        modname, dot, clsname = rdrpath.rpartition('.')
        if not dot:
            raise SnapValueError('Invalid reader class for scheme %s' % scheme, rdrpath)

        try:
            if modname not in sys.modules:
                __import__(modname)
            mod = sys.modules[modname]
            cls = getattr(mod, clsname)
        except Exception:
            # sys.exc_info() is used instead of binding the exception in the
            # except clause so the syntax is valid on every Python version.
            e = sys.exc_info()[1]
            raise SnapValueError('Invalid reader class for scheme %s' % scheme, rdrpath, str(e))

        # Accept any new-style class, including ones with a custom metaclass.
        if not isinstance(cls, type):
            raise SnapObjTypeError('Invalid type of reader class for scheme %s' % scheme, rdrpath, type(cls))

        return cls(input, username, password, encoding, proxy)


    def open(self, callback=None, cbdata=None):
        """
        Open a connection to the input source.

        @param callback:  User provided callback that allows the user to create its own reader object.
            It is called as callback(input, username, password, cbdata).
        @type callback: function, or None.

        @param cbdata: Opaque value passed through to the callback as its last argument.

        @return: The reader object returned by the callback.
        @rtype: snaplogic.common.SnapReader

        @except SnapObjTypeError: on callback value None

        """
        if callback:
            rdr = callback(self._input, self._username, self._password, cbdata)
            return rdr

        raise SnapObjTypeError('Abstract class error')


    def close(self, reader=None):
        """
        Close the connection to the input source.

        This base implementation always raises; concrete readers override it.

        @param reader: The reader object that user created from the callback in open() method.
        @type reader: User-created reader object.

        @except SnapObjTypeError: always, because this class is abstract.

        """
        raise SnapObjTypeError('Abstract class error')
    
def parse_url(config, url):
    """
    Parse URL and check if we should be using a proxy for this URL.

    The hostname (without port) is compared against each entry of the
    comma-separated 'cc_no_proxy' list.  An entry matches when the hostname
    ends with it; the comparison is case-insensitive.  A URL without a
    hostname never matches.

    @param config: CC config section (dict-like), or None/empty if there is none.
    @param url: URL
    @return: tuple (host, no_proxy)
             where host is a string containing the hostname including port of the URL,
             and no_proxy is a boolean which is True when the proxy should be
             bypassed for this host.

    """

    # urlparse returns a tuple subclass that also has extra attributes:
    # netloc includes the port number, hostname does not.  We need a pure
    # hostname to check against the no_proxy domain list.
    parsed_url = urlparse.urlparse(url)
    host = parsed_url.netloc
    # hostname is None when the URL has no network location (e.g.
    # 'file:///tmp/x'); normalize it so the suffix test below cannot fail.
    hostname = parsed_url.hostname or ''

    # Check cc_no_proxy and if it matches the hostname bypass the proxy.
    no_proxy = False
    if hostname and config and 'cc_no_proxy' in config:
        for domain in config['cc_no_proxy'].split(','):
            # urlparse lower-cases hostname, so lower-case the configured
            # domain too; otherwise 'Example.COM' would never match.
            domain = domain.strip().lower()
            if domain and hostname.endswith(domain):
                no_proxy = True
                break

    return (host, no_proxy)
    
class SnapFtpReader(SnapReader):

    """
    Reader class for ftp input.
    """

    def __init__(self, input, username, password, encoding=None, proxy=None):
        """
        Initialize internal variables.

        @param input: The input source.
        @type input: str

        @param username: Credentials: username to read the input source
        @type username: str, None or empty string, if no credential needed.

        @param password: Credentials: password to read the input source
        @type password: str, None or empty string, if no credential needed.

        @param encoding: The encoding to expect from the source.
        @type encoding:  str

        @param proxy: Proxy server information that may be needed to read
            resources at external URIs.  open() indexes it with the keys
            "cc_ftp_proxy" and "cc_ftp_proxy_port".
        @type proxy: dict-like, or None

        """
        super(SnapFtpReader, self).__init__(input, username, password, encoding, proxy)


    def open(self, callback=None, cbdata=None):
        """
        Open a connection to the input source.

        @param callback:  User provided callback that allows the user to create its own reader object.
        @type callback: function, or None.

        @param cbdata: Opaque value forwarded to the callback.  Accepted here
            so that the signature matches SnapReader.open().

        @return: A reader object: the callback's result if a callback is given, otherwise self.
        @rtype: snaplogic.common.SnapReader

        """
        if callback:
            return super(SnapFtpReader, self).open(callback=callback, cbdata=cbdata)

        proxy_handler = None
        if self._proxy:
            proxy_server = self._proxy["cc_ftp_proxy"]
            proxy_port = self._proxy["cc_ftp_proxy_port"]
            if proxy_server != "":
                pr = proxy_server + ":" + proxy_port
                proxy_handler = urllib2.ProxyHandler({'ftp': pr})

        # In case specific proxy information was not set up in the CC section of the
        # config file, see if FTP_PROXY environment variable is set and use it.

        # TODO: Note that this "if" block may be unnecessary.
        # Python urllib2 reads environment variable ftp_proxy (which appears to be case-insensitive).
        # This behavior is hardwired into urllib2, so if this variable is assigned, no matter what the code does,
        # urllib2 will use the proxy specified by the variable.
        if not proxy_handler:
            pr = os.environ.get('FTP_PROXY')
            if pr is not None:
                proxy_handler = urllib2.ProxyHandler({'ftp': pr})

        # Parse the URL and check if proxy should be used or not:
        # we have a config parameter cc_no_proxy to specify if there are domains
        # for which we shouldn't be using a proxy.
        (host, no_proxy) = parse_url(self._proxy, self._input)
        if no_proxy:
            # If no_proxy was returned for the URL set proxy handlers to None.
            proxy_handler = None

        # Ignore the credential, even if it is provided.
        # urllib2.FTPHandler login with empty user name and password (works for anonymous login as well).
        if not proxy_handler:
            opener = build_opener(urllib2.FTPHandler)
        else:
            opener = build_opener(proxy_handler, urllib2.FTPHandler)
        self._resp = opener.open(self._input)
        return self

    def _read_encoding(self):
        """
        The default read() method is replaced with this method when an encoding is specified.

        @return: The content, decoded with the reader's encoding.
        @rtype: unicode

        """

        return unicode(self._resp.read(), self.encoding)

    def read(self):
        """
        Read the content from the connection.

        @return: The content.
        @rtype: str/unicode

        """
        return self._resp.read()

    def _readline_encoding(self):
        """
        The default readline() method is replaced with this method when an encoding is specified.

        @return: The next line of the content, decoded with the reader's encoding.
        @rtype: unicode

        """

        return unicode(self._resp.readline(), self.encoding)

    def readline(self):
        """
        Read the next line of the content from the connection.

        @return: The next line of the content.
        @rtype: str/unicode

        """
        return self._resp.readline()

    def _readlines_encoding(self):
        """
        The default readlines() method is replaced with this method when an encoding is specified.

        @return: The lines of the content, each decoded with the reader's encoding.
        @rtype: list of unicode

        """
        return [unicode(l, self.encoding) for l in self._resp.readlines()]

    def readlines(self):
        """
        Read the content in lines from the connection.

        @return: The lines of the content.
        @rtype: list

        """
        return self._resp.readlines()


    def close(self):
        """
        Close the connection.  Does nothing if the connection is not open.

        """
        if self._resp:
            self._resp.close()
            self._resp = None
    
    
class SnapFileReader(SnapReader):

    """
    Reader class for file input.
    """

    def __init__(self, input, username, password, encoding=None, proxy=None):
        """
        Initialize internal variables.

        @param input: The input source.
        @type input: str

        @param username: Credentials: username to read the input source
        @type username: str, None or empty string, if no credential needed.

        @param password: Credentials: password to read the input source
        @type password: str, None or empty string, if no credential needed.

        @param encoding: The encoding to expect from the source.
        @type encoding:  str

        @param proxy: Proxy server information that may be needed to read
            resources at external URIs
        @type proxy: str

        """
        super(SnapFileReader, self).__init__(input, username, password, encoding, proxy)

    def open(self, callback=None, cbdata=None):
        """
        Open a connection to the input source.

        @param callback:  User provided callback that allows the user to create its own reader object.
        @type callback: function, or None.

        @param cbdata: Opaque value forwarded to the callback.  Accepted here
            so that the signature matches SnapReader.open().

        @return: A reader object: the callback's result if a callback is given, otherwise self.
        @rtype: snaplogic.common.SnapReader

        """
        if callback:
            return super(SnapFileReader, self).open(callback=callback, cbdata=cbdata)

        opener = build_opener(urllib2.FileHandler)
        self._resp = opener.open(self._input)
        return self

    def read(self):
        """
        Read the content from the connection.

        @return: The content.
        @rtype: str

        """
        return self._resp.read()

    def _read_encoding(self):
        """
        The default read() method is replaced with this method when an
        encoding is specified.

        @return: The content, decoded with the reader's encoding.
        @rtype: unicode

        """

        return unicode(self._resp.read(), self.encoding)

    def _readline_encoding(self):
        """
        The default readline() method is replaced with this method when an
        encoding is specified.

        @return: The next line of the content, decoded with the reader's encoding.
        @rtype: unicode

        """

        return unicode(self._resp.readline(), self.encoding)

    def readline(self):
        """
        Read the next line of the content from the connection.

        @return: The next line of the content.
        @rtype: str

        """
        return self._resp.readline()

    def _readlines_encoding(self):
        """
        The default readlines() method is replaced with this method when an
        encoding is specified.

        @return: The lines of the content, each decoded with the reader's encoding.
        @rtype: list of unicode

        """
        return [unicode(l, self.encoding) for l in self._resp.readlines()]

    def readlines(self):
        """
        Read the content in lines from the connection.

        @return: The lines of the content.
        @rtype: list

        """
        return self._resp.readlines()

    def close(self):
        """
        Close the connection.  Does nothing if the connection is not open.

        """
        if self._resp:
            self._resp.close()
            self._resp = None
    
class SnapHttpReader(SnapReader):

    """
    Reader class for http input.
    """

    def __init__(self, input, username, password, encoding=None, proxy=None):
        """
        Initialize internal variables.

        @param input: The input source.
        @type input: str

        @param username: Credentials: username to read the input source
        @type username: str, None or empty string, if no credential needed.

        @param password: Credentials: password to read the input source
        @type password: str, None or empty string, if no credential needed.

        @param encoding: The encoding to expect from the source.
        @type encoding:  str

        @param proxy: Proxy server information that may be needed to read
            resources at external URIs.  open() indexes it with the
            "cc_http_proxy*" keys.
        @type proxy: dict-like, or None

        """
        super(SnapHttpReader, self).__init__(input, username, password, encoding, proxy)


    def open(self, callback=None, cbdata=None):
        """
        Open a reader object for HTTP access.

        @param callback:  User provided callback that allows the user to create its own reader object.
        @type callback: function, or None.

        @param cbdata: Opaque value forwarded to the callback.  Accepted here
            so that the signature matches SnapReader.open().

        @return: A reader object: the callback's result if a callback is given, otherwise self.
        @rtype: snaplogic.common.SnapReader

        """
        if callback:
            return super(SnapHttpReader, self).open(callback=callback, cbdata=cbdata)

        # Optional proxy handler setup for accessing external URIs
        proxy_handler = None
        proxy_auth_handler = None
        if self._proxy:
            proxy_server = self._proxy["cc_http_proxy"]
            proxy_port = self._proxy["cc_http_proxy_port"]

            if proxy_server != "":
                pr = proxy_server + ":" + proxy_port
                # NB: support for https over proxy requires patching Python 2.5.
                proxy_handler = urllib2.ProxyHandler({'http': pr})

                # NB: Authenticating proxy is untested.
                # There is also potentially Digest authentication to add later.
                proxy_auth_realm = self._proxy["cc_http_proxy_realm"]
                proxy_auth_host = self._proxy["cc_http_proxy_host"]
                proxy_auth_username = self._proxy["cc_http_proxy_username"]
                proxy_auth_password = self._proxy["cc_http_proxy_password"]
                if proxy_auth_username != "":
                    proxy_auth_handler = urllib2.ProxyBasicAuthHandler()
                    proxy_auth_handler.add_password(proxy_auth_realm, proxy_auth_host, proxy_auth_username, proxy_auth_password)

        # In case specific proxy information was not set up in the CC section of the
        # config file, see if HTTP_PROXY environment variable is set and use it.
        # Note: urllib2 provides default, transparent proxy handler for this (non-authenticating
        # proxy only - to use authenticating proxy, config options like cc_http_proxy_realm have
        # to be used for the codepath above) case, however we set it explicitly here for symmetry
        # and to exercise the below code path in QA system, since there's no way to automatically
        # test config setup yet due to bug 1353.

        # TODO: Note that this "if" block may be unnecessary.
        # Python urllib2 reads environment variable http_proxy (which appears to be case-insensitive).
        # This behavior is hardwired into urllib2, so if this variable is assigned, no matter what the code does,
        # urllib2 will use the proxy specified by the variable.
        if not proxy_handler:
            pr = os.environ.get('HTTP_PROXY')
            if pr is not None:
                proxy_handler = urllib2.ProxyHandler({'http': pr})

        # Parse the URL and check if proxy should be used or not:
        # we have a config parameter cc_no_proxy to specify if there are domains
        # for which we shouldn't be using a proxy.
        (host, no_proxy) = parse_url(self._proxy, self._input)
        if no_proxy:
            # If no_proxy was returned for the URL set proxy handlers to None.
            proxy_handler = None
            proxy_auth_handler = None

        # Collect the handlers in a fixed order: proxy, proxy authentication,
        # then HTTP Basic authentication.  The proxy authentication handler is
        # only used together with a proxy handler.
        handlers = []
        if proxy_handler:
            handlers.append(proxy_handler)
            if proxy_auth_handler:
                handlers.append(proxy_auth_handler)

        if self._username or self._password:
            # Authenticated connection: Basic HTTP Authentication with the
            # default realm for the URL's host.
            mgr = HTTPPasswordMgrWithDefaultRealm()
            mgr.add_password(None, host, self._username, self._password)
            handlers.append(HTTPBasicAuthHandler(mgr))

        opener = build_opener(*handlers)
        self._resp = opener.open(self._input)

        return self

    def _read_encoding(self):
        """
        The default read() method is replaced with this method when an
        encoding is specified.

        @return: The content, decoded with the reader's encoding.
        @rtype: unicode

        """

        return unicode(self._resp.read(), self.encoding)

    def read(self):
        """
        Read the content from the connection.

        @return: The content.
        @rtype: str/unicode

        """
        return self._resp.read()

    def _readline_encoding(self):
        """
        The default readline() method is replaced with this method when an
        encoding is specified.

        @return: The next line of the content, decoded with the reader's encoding.
        @rtype: unicode

        """

        return unicode(self._resp.readline(), self.encoding)

    def readline(self):
        """
        Read the next line of the content from the connection.

        @return: The next line of the content.
        @rtype: str

        """
        return self._resp.readline()


    def _readlines_encoding(self):
        """
        The default readlines() method is replaced with this method when an
        encoding is specified.

        @return: The lines of the content, each decoded with the reader's encoding.
        @rtype: list of unicode

        """
        return [unicode(l, self.encoding) for l in self._resp.readlines()]

    def readlines(self):
        """
        Read the content in lines from the connection.

        @return: The lines of the content.
        @rtype: list

        """
        return self._resp.readlines()


    def close(self):
        """
        Close the connection.  Does nothing if the connection is not open.

        """
        if self._resp:
            self._resp.close()
            self._resp = None
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.