CSVParser.py :  » Web-Frameworks » Webware » Webware-1.0.2 » MiscUtils » Python Open Source

Home
Python Open Source
1.3.1.2 Python
2.Ajax
3.Aspect Oriented
4.Blog
5.Build
6.Business Application
7.Chart Report
8.Content Management Systems
9.Cryptographic
10.Database
11.Development
12.Editor
13.Email
14.ERP
15.Game 2D 3D
16.GIS
17.GUI
18.IDE
19.Installer
20.IRC
21.Issue Tracker
22.Language Interface
23.Log
24.Math
25.Media Sound Audio
26.Mobile
27.Network
28.Parser
29.PDF
30.Project Management
31.RSS
32.Search
33.Security
34.Template Engines
35.Test
36.UML
37.USB Serial
38.Web Frameworks
39.Web Server
40.Web Services
41.Web Unit
42.Wiki
43.Windows
44.XML
Python Open Source » Web Frameworks » Webware 
Webware » Webware 1.0.2 » MiscUtils » CSVParser.py
"""CSVParser.py

A parser for CSV files.

"""

import types

# Parser states. Each value doubles as the index of the matching handler
# in the CSVParser._handlers list, so the order here must match that list.
(StartRecord, StartField, InField, QuoteInField,
  InQuotedField, QuoteInQuotedField, EndQuotedField) = range(7)

# Sentinel a state handler may return to make parse() stop reading the line
Finished = 10


class ParseError(Exception):
  """Raised by CSVParser when the input is not well-formed CSV."""


class CSVParser:
  """Parser for CSV files.

  Parses CSV files including all subtleties such as:
    * commas in fields
    * double quotes in fields
    * embedded newlines in fields
      - Examples of programs that produce such beasts include
        MySQL and Excel

  For a higher-level, friendlier CSV class with many conveniences,
  see DataTable (which uses this class for its parsing).

  The parser is a character-driven state machine: parse() feeds each
  character of a line to the handler for the current state, then feeds
  '\\0' to signal the end of the line. State is kept between calls so
  that a quoted field can span several lines.

  Example:
    records = []
    parse = CSVParser().parse
    for line in lines:
      results = parse(line)
      if results is not None:
        records.append(results)

  CREDIT

  The algorithm was taken directly from the open source Python
  C-extension, csv:
    http://www.object-craft.com.au/projects/csv/

  It would be nice to use the csv module when present, since it is
  substantially faster. Before that can be done, it needs to support
  allowComments and stripWhitespace, and pass the TestCSVParser.py
  test suite.

  """

  def __init__(self, allowComments=1, stripWhitespace=1, fieldSep=',',
      autoReset=1, doubleQuote=1):
    """Create a new CSV parser.

    allowComments: If true (the default), then comment lines using
      the Python comment marker are allowed. A record is treated as a
      comment only if its very first character is '#'.
    stripWhitespace: If true (the default), then left and right whitespace
      is stripped off from all fields.
    fieldSep: Defines the field separator (a comma by default). It is
      compared against one input character at a time, so it has to be
      a single character.
    autoReset: If true (the default), recover from errors automatically:
      the next call of parse() after a ParseError resets the parser first.
    doubleQuote: If true (the default), assume quotes in fields are
      escaped by appearing doubled.

    """
    # settings
    self._allowComments = allowComments
    self._stripWhitespace = stripWhitespace
    self._doubleQuote = doubleQuote
    self._fieldSep = fieldSep
    self._autoReset = autoReset

    # Other
    self._state = StartRecord
    self._fields = [] # completed fields of the current record
    self._hadParseError = False
    self._field = [] # a list of chars for the cur field
    # cached bound method; must be rebound whenever self._field is replaced
    self.addChar = self._field.append

    # The handlers for the various states, indexed by the state value
    self._handlers = [
      self.startRecord,
      self.startField,
      self.inField,
      self.quoteInField,
      self.inQuotedField,
      self.quoteInQuotedField,
      self.endQuotedField,
    ]


  ## Parse ##

  def parse(self, line):
    """Parse a single line and return a list of string fields.

    line: One line of input. A single line terminator ('\\r', '\\n' or
      '\\r\\n') at the very end of the line is ignored.

    Returns the list of fields once a record is complete; this is an
    empty list for an empty line or a comment line. Returns None if the
    CSV record contains embedded newlines and the record is not yet
    complete; pass the following line(s) to finish it.

    Raises ParseError if a line terminator is followed by more characters
    or if a closing quote is followed by something unexpected.

    """
    if self._autoReset and self._hadParseError:
      self.reset()
    handlers = self._handlers

    i = 0
    lineLen = len(line)
    while i < lineLen:
      c = line[i]
      if c == '\r':
        i += 1
        if i == lineLen:
          break # Mac end of line
        c = line[i]
        if c == '\n':
          i += 1
          if i == lineLen:
            break # Win end of line

        # characters follow the terminator, so it was not at the line end
        self._hadParseError = True
        raise ParseError('Newline inside string')

      elif c == '\n':
        i += 1
        if i == lineLen:
          break # unix end of line

        self._hadParseError = True
        raise ParseError('Newline inside string')

      else:
        if handlers[self._state](c) == Finished:
          break # process a character

      i += 1

    handlers[self._state]('\0') # signal the end of the input

    if self._state == StartRecord:
      # the record is complete: hand it out and start a fresh one
      fields = self._fields
      self._fields = []
      if self._stripWhitespace:
        fields = [field.strip() for field in fields]
      return fields
    else:
      return None # indicates multi-line record; e.g. not finished


  ## Reset ##

  def reset(self):
    """Reset the parser.

    Resets the parser to a fresh state in order to recover from
    exceptions. But if autoReset is true (the default), this is
    done automatically.

    The partially collected field is discarded as well, so characters
    read before an error cannot leak into the next record.

    """
    self._fields = []
    self._field = []
    self.addChar = self._field.append
    self._state = StartRecord
    self._hadParseError = False


  ## State Handlers ##

  # Every handler takes the current character c; '\0' stands for the end
  # of the line.

  def startRecord(self, c):
    """Handle the first character of a record.

    Returns Finished for a comment line so that parse() skips the rest.

    """
    if c != '\0': # not empty line
      if c == '#' and self._allowComments:
        return Finished
      else:
        self._state = StartField
        self.startField(c)

  def startField(self, c):
    """Handle the first character of a field."""
    if c == '"':
      self._state = InQuotedField # start quoted field
    elif c == self._fieldSep:
      self.saveField() # save empty field
    elif c == ' ' and self._stripWhitespace:
      pass # skip over preceding whitespace
    elif c == '\0':
      self.saveField() # save empty field
      self._state = StartRecord
    else:
      self.addChar(c) # begin new unquoted field
      self._state = InField

  def inField(self, c):
    """Handle a character inside an unquoted field."""
    if c == self._fieldSep:
      self.saveField()
      self._state = StartField
    elif c == '\0':
      self.saveField() # end of line
      self._state = StartRecord
    elif c == '"' and self._doubleQuote:
      self._state = QuoteInField
    else:
      self.addChar(c) # normal character

  def quoteInField(self, c):
    """Handle the character following a quote in an unquoted field.

    The quote itself is always kept; a directly following second quote
    is dropped, so that "" collapses to a single quote.

    """
    self.addChar('"')
    if c == '"':
      self._state = InField # save "" as "
    elif c == '\0':
      self.saveField() # end of line
      self._state = StartRecord
    elif c == self._fieldSep:
      self.saveField()
      self._state = StartField
    else:
      self.addChar(c) # normal character
      self._state = InField

  def inQuotedField(self, c):
    """Handle a character inside a quoted field."""
    if c == '"':
      if self._doubleQuote:
        # could be the closing quote or the first half of an escaped quote
        self._state = QuoteInQuotedField
      else:
        self.saveField() # end of field
        self._state = EndQuotedField
    elif c == '\0':
      # the field continues on the next line; keep the line break
      self.addChar('\n') # end of line
    else:
      self.addChar(c) # normal character

  def quoteInQuotedField(self, c):
    """Handle the character following a quote inside a quoted field.

    Raises ParseError if the quote turns out to be a closing quote that
    is not followed by the field separator or the end of the line.

    """
    if c == '"':
      self.addChar('"') # save "" as "
      self._state = InQuotedField
    elif c == self._fieldSep:
      self.saveField()
      self._state = StartField
    elif c == ' ' and self._stripWhitespace:
      pass # skip it
    elif c == '\0':
      self.saveField() # end of line
      self._state = StartRecord
    else:
      self._hadParseError = True # illegal
      raise ParseError('%s expected after "' % self._fieldSep)

  def endQuotedField(self, c):
    """Handle the character following the closing quote of a quoted field.

    This state is only entered when doubleQuote is off; the field has
    already been saved at this point.

    Raises ParseError if c is neither the field separator nor the end
    of the line.

    """
    if c == self._fieldSep: # seen closing " on quoted field
      self._state = StartField # wait for new field
    elif c == '\0':
      self._state = StartRecord # end of line
    else:
      self._hadParseError = True
      raise ParseError('%s expected after "' % self._fieldSep)

  def saveField(self):
    """Append the collected characters as a field and start a new one."""
    self._fields.append(''.join(self._field))
    self._field = []
    self.addChar = self._field.append


# Module-level convenience: call the global function parse() if you like the
# default settings of the CSVParser. parse is the bound method of one shared
# parser instance, so all callers share its state (e.g. an unfinished
# multi-line record).
_parser = CSVParser()
parse = _parser.parse


def joinCSVFields(fields):
  """Return a CSV record (e.g. a string) from a sequence of fields.

  Fields containing commas (,), double quotes (") or line breaks are
  quoted and double quotes are escaped (""). The terminating newline is
  NOT included.

  fields: A sequence of strings; an empty sequence gives an empty string.

  Raises TypeError if one of the fields is not a string.

  """
  newFields = []
  for field in fields:
    # an explicit check instead of an assert, which vanishes under -O
    if not isinstance(field, str):
      raise TypeError(
        'CSV fields must be strings, not %s' % type(field).__name__)
    if '"' in field:
      newField = '"' + field.replace('"', '""') + '"'
    elif ',' in field or '\n' in field or '\r' in field:
      # an unquoted line break would split the record in two
      newField = '"' + field + '"'
    else:
      newField = field
    newFields.append(newField)
  return ','.join(newFields)
www.java2java.com | Contact Us
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.