k_encoding_makecharsets.py : » Web-Frameworks » Karrigell » Karrigell-3.1 » karrigell » core » k_encodings » Python Open Source

1.	3.1.2 Python
2.	Ajax
3.	Aspect Oriented
4.	Blog
5.	Build
6.	Business Application
7.	Chart Report
8.	Content Management Systems
9.	Cryptographic
10.	Database
11.	Development
12.	Editor
13.	Email
14.	ERP
15.	Game 2D 3D
16.	GIS
17.	GUI
18.	IDE
19.	Installer
20.	IRC
21.	Issue Tracker
22.	Language Interface
23.	Log
24.	Math
25.	Media Sound Audio
26.	Mobile
27.	Network
28.	Parser
29.	PDF
30.	Project Management
31.	RSS
32.	Search
33.	Security
34.	Template Engines
35.	Test
36.	UML
37.	USB Serial
38.	Web Frameworks
39.	Web Server
40.	Web Services
41.	Web Unit
42.	Wiki
43.	Windows
44.	XML
Python Open Source » Web Frameworks » Karrigell
Karrigell » Karrigell 3.1 » karrigell » core » k_encodings » k_encoding_makecharsets.py
#!/bin/env python
#  -*- coding: iso-8859-15 -*-
# File: k_encoding_makecharsets.py
"""
From IANA character sets definitions, to have data in a Python usable form.

This module builds k_encoding_charsets, containing data about character sets,
Python encodings and preferred MIME types.

First, we download the charsets description data from the official source:
    http://www.iana.org/assignments/character-sets

Then, we build a list of CharacterSet objects.
Each CharacterSet has these members:
    -*name*     the IANA charset name
    -*mib*      the IANA MIBenum (may be None)
    -*aliases*  a list of alias names for the charset (may be empty list)
    -*mime*     the preferred MIME name if specified (may be None)
    -*pyenc*    the corresponding Python encoding if found (may be None)


This module, called with the -g option, downloads the character-sets file
from IANA into the current working directory.

Called with the -p option, it prints the charsets list.

Called with the -m option, it builds the k_encoding_charsets module.
"""

#============================================================================
__all__ = ["make"]
import encodings
import pprint
import sys
import urllib2
from encodings.aliases import aliases
# The functions below refer to Python's codec alias table as
# ``encoding_aliases`` (the bare name ``aliases`` is shadowed by locals).
from encodings.aliases import aliases as encoding_aliases
from os import path
from sets import Set

#============================================================================
# Set to a true value to make the IANA parser print each entry it finds.
DEBUG = 0
# Dictionary of CharacterSet objects (indexed by IANA name).
charsets = {}
# Lists of CharacterSet objects (indexed by Python encoding).
pyenc_charsets = {}
# Correspondence between normalized aliases and Python encodings.
aliases2python = {}
# Correspondence between normalized aliases and preferred MIME names.
# Note: only aliases having a preferred MIME name are stored.
aliases2mime = {}

#============================================================================
# Small tool function - Python aliases are lowercase... but the normalize
# function does not make names lowercase.
def normenc(e) :
    """Return encoding name *e* normalized by the encodings package, lowercased.

    @param e: an encoding or character set name.
    @type e: str
    @return: the normalized, lowercase form of the name.
    @rtype: str
    """
    normalized = encodings.normalize_encoding(e)
    return normalized.lower()

#============================================================================
class CharacterSet (object) :
    """Description of one character set.

    Creating an instance registers it in the module-level ``charsets``
    dictionary (under its name) and, when a Python encoding is given, in the
    ``pyenc_charsets`` lists (under that encoding).
    """
    def __init__(self,name,mib,aliases,mime,pyenc) :
        """Store the description and register the new object.

        @param name: the charset name (key used in ``charsets``).
        @param mib: the MIBenum value (may be None).
        @param aliases: list of alias names (may be empty).
        @param mime: the preferred MIME name (may be None).
        @param pyenc: the corresponding Python encoding (may be None).
        """
        self.name = name
        self.mib = mib
        self.aliases = aliases
        self.mime = mime
        self.pyenc = pyenc

        # Normalized spellings: the name first, then each alias in order.
        normalized = []
        for label in [name]+aliases :
            normalized.append(normenc(label))
        self.normnames = normalized

        # Self-registration in the module dictionaries.
        charsets[name] = self
        if pyenc :
            if pyenc not in pyenc_charsets :
                pyenc_charsets[pyenc] = []
            pyenc_charsets[pyenc].append(self)

    def __repr__(self) :
        """Return a debugging representation showing the five main members."""
        fields = (self.name,self.mib,self.aliases,self.mime,self.pyenc)
        return "<CharacterSet(name=%r,mib=%r,aliases=%r,mime=%r,pyenc=%r)>"%fields


#============================================================================
def upload_iana_file() :
    """Get IANA character sets definitions from its source site.

    Despite its name, this function downloads the file found at
    http://www.iana.org/assignments/character-sets (network access needed).

    @return: list of lines to iterate on (the content split on newline
        characters, so lines carry no trailing newline).
    @rtype: [str,str,...]
    @raise urllib2.URLError: when the file cannot be retrieved.
    """
    f = urllib2.urlopen("http://www.iana.org/assignments/character-sets")
    try :
        data = f.read()
    finally :
        # Release the connection even when reading fails.
        f.close()
    return data.split("\n")

#============================================================================
def _register_charset(name,mib,aliases,mime) :
    """Create (and thereby register) the CharacterSet of one parsed entry."""
    # Search a Python correspondence from one of name/alias.
    # NOTE(review): only the keys of Python's alias table are consulted, so
    # a charset whose names match nothing but a codec's canonical name may
    # end with no Python encoding - confirm this is intended.
    pyenc = None
    for n in [name]+aliases :
        pyenc = encoding_aliases.get(normenc(n),None)
        if pyenc : break
    if DEBUG : print("End %s" % name)
    # Make object (it auto-registers in dicts/lists).
    CharacterSet(name,mib,aliases,mime,pyenc)


def analyze_charset_data(iana_data) :
    """Extract data from the IANA character sets definitions.

    Each entry starts with a "Name: " line and ends with a white line (or
    with the end of the data).  Inside an entry, "MIBenum: " and "Alias: "
    lines are used; any other line is ignored.  A CharacterSet object is
    created for each entry, which registers it in the module dictionaries.

    @param iana_data: definitions of characters.
    @type iana_data: line-iterable (file, list of string...)
    @raise ValueError: if a MIBenum value is not an integer.
    """
    NAMEKEYWORD     = "Name: "
    MIBKEYWORD      = "MIBenum: "
    ALIASKEYWORD    = "Alias: "
    MIMEMARK        = "preferred MIME name"

    name = None         # None while we are outside of an entry.
    mib = None
    aliases = []
    mime = None

    for line in iana_data :
        line = line.strip()
        if name is None :
            if line.startswith(NAMEKEYWORD) :   # Start of an entry.
                # The name line can have extra characters:
                # Name:  xxxxx                            [ref1,ref2]
                # Name:  xxxxx (preferred MIME name)      [ref1,ref2]
                name = line.split()[1]
                if MIMEMARK in line : mime = name
                if DEBUG : print("Found %s" % name)
            continue
        if not line :
            # End of an entry: store it and start again with fresh values
            # (a new list is needed, the object keeps the previous one).
            _register_charset(name,mib,aliases,mime)
            name = None
            mib = None
            aliases = []
            mime = None
            continue
        # Keep data in set definition.
        if line.startswith(MIBKEYWORD) :
            mib = int(line[len(MIBKEYWORD):])
        elif line.startswith(ALIASKEYWORD) :
            alias = line[len(ALIASKEYWORD):]
            # ... yes, there are lines with "Alias: None" ...
            if alias == "None" : continue
            if MIMEMARK in alias :
                alias = alias.split()[0]
                mime = alias
            aliases.append(alias.strip())

    # The data may end without a final white line: do not lose the entry
    # which was being read.
    if name is not None :
        _register_charset(name,mib,aliases,mime)

#============================================================================
# Python uses encodings for more than character sets conversions; these
# codecs are not stored in aliases2python.
_NON_CHARSET_CODECS = (
    'base64_codec','bz2_codec','hex_codec',
    'idna','quopri_codec','rot_13','string_escape',
    'unicode_escape','uu_codec','zlib_codec',
)


def build_dicts() :
    """Build dictionaries usable for encoding identification.

    Fill the module-level ``aliases2python`` (normalized alias -> Python
    encoding) and ``aliases2mime`` (normalized alias -> preferred MIME name)
    from Python's alias table and from the CharacterSet objects already
    registered in ``charsets`` / ``pyenc_charsets``.
    """
    # Make some missing links (skipped when no UTF-8 entry was registered,
    # e.g. when no IANA data has been analyzed yet).
    utf8 = charsets.get('UTF-8')
    if utf8 is not None :
        utf8.mime = 'UTF-8'

    # We build aliases2python from the encodings.aliases.aliases dictionary -
    # maybe incomplete (if an encoding has no alias, is it in this
    # dictionary...).
    for alias in encoding_aliases :
        pyenc = encoding_aliases[alias]
        if pyenc in _NON_CHARSET_CODECS :
            continue
        # Fill aliases2python with Python known aliases.
        aliases2python[normenc(alias)] = pyenc
        aliases2python[normenc(pyenc)] = pyenc

    # Continue to fill aliases2python using IANA charsets; names already
    # known from Python keep their Python mapping.
    for clist in pyenc_charsets.values() :
        for c in clist :
            for normname in c.normnames :
                if normname not in aliases2python :
                    aliases2python[normname] = c.pyenc

    # Now, set aliases2mime, but only for encodings supported by Python
    # where we have a corresponding MIME information.
    for clist in pyenc_charsets.values() :
        for c in clist :
            if not c.mime : continue
            for n in [c.name]+c.aliases :
                aliases2mime[normenc(n)] = c.mime


#============================================================================
def make_k_encoding_charsets() :
    """Write the k_encoding_charsets.py module in the directory of this file.

    The generated module defines ``encoding2python_map`` and
    ``encoding2mime_map``, pretty-printed from the current content of
    ``aliases2python`` and ``aliases2mime``.

    @raise IOError: if the file cannot be created or written.
    """
    # path.join takes the components as separate arguments (not a list).
    fname = path.join(path.dirname(path.abspath(__file__)),
                      "k_encoding_charsets.py")
    # Plain "w": universal newlines mode ("U") is only valid for reading.
    f = open(fname,"w")
    try :
        f.write("#!/bin/env python\n")
        f.write("# -*- coding: ascii -*-\n")
        f.write("# file: k_encoding_charsets - automatically generated!!!\n")
        f.write("encoding2python_map = ")
        pprint.pprint(aliases2python,f,indent=4)
        f.write("\n")
        f.write("encoding2mime_map = ")
        pprint.pprint(aliases2mime,f,indent=4)
        f.write("\n")
    finally :
        # Always flush and release the generated file.
        f.close()

#============================================================================
def make() :
    """Do our job: build k_encoding_charsets module.

    The IANA file is downloaded and analyzed only when no character set has
    been registered yet; then the dictionaries are built and the module is
    written.
    """
    if len(charsets) == 0 :
        # Nothing registered yet: get and parse the IANA definitions.
        analyze_charset_data(upload_iana_file())
    build_dicts()
    make_k_encoding_charsets()

#============================================================================
# Command line use: download and analyze the IANA file, then act according
# to the given options.
if __name__ == "__main__" :
    if len(sys.argv)<2 or '-h' in sys.argv or '--help' in sys.argv  :
        print("Usage: python k_encoding_makecharsets.py [-g] [-p] [-m]")
        print("  -m make k_encoding_charsets.py module (-b is a synonym)")
        print("  -g store character-sets file (from http://www.iana.org/assignments/).")
        print("  -p print identified charsets")
        print("")
        sys.exit()


    print("Loading http://www.iana.org/assignments/character-sets")
    iana_data = upload_iana_file()
    analyze_charset_data(iana_data)

    if '-g' in sys.argv :
        print("Writing %s" % path.abspath("character-sets"))
        # Plain "w": universal newlines mode ("U") is only valid for reading.
        f = open("character-sets","w")
        try :
            # The lines come from a split on "\n": joining them back gives
            # the downloaded content, without an extra final line.
            f.write("\n".join(iana_data))
        finally :
            f.close()

    if '-p' in sys.argv :
        p = pprint.PrettyPrinter()
        print("Charsets from IANA:")
        p.pprint(charsets)

    # The usage text formerly advertised -b for this action: accept both.
    if '-m' in sys.argv or '-b' in sys.argv :
        make()
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.