t_parse.py : » PDF » ReportLab » ReportLab_2_4 » tools » docco » Python Open Source

1.	3.1.2 Python
2.	Ajax
3.	Aspect Oriented
4.	Blog
5.	Build
6.	Business Application
7.	Chart Report
8.	Content Management Systems
9.	Cryptographic
10.	Database
11.	Development
12.	Editor
13.	Email
14.	ERP
15.	Game 2D 3D
16.	GIS
17.	GUI
18.	IDE
19.	Installer
20.	IRC
21.	Issue Tracker
22.	Language Interface
23.	Log
24.	Math
25.	Media Sound Audio
26.	Mobile
27.	Network
28.	Parser
29.	PDF
30.	Project Management
31.	RSS
32.	Search
33.	Security
34.	Template Engines
35.	Test
36.	UML
37.	USB Serial
38.	Web Frameworks
39.	Web Server
40.	Web Services
41.	Web Unit
42.	Wiki
43.	Windows
44.	XML
Python Open Source » PDF » ReportLab
ReportLab » ReportLab_2_4 » tools » docco » t_parse.py
#Copyright ReportLab Europe Ltd. 2000-2004
#see license.txt for license details
#history http://www.reportlab.co.uk/cgi-bin/viewcvs.cgi/public/reportlab/trunk/reportlab/tools/docco/t_parse.py
"""
Template parsing module inspired by REXX (with thanks to Donn Cave for discussion).

Template initialization has the form:
   T = Template(template_string, wild_card_marker, single_char_marker,
             x = regex_x, y = regex_y, ...)
Parsing has the form
   ([match1, match2, ..., matchn], lastindex) = T.PARSE(string)

Only the first argument is mandatory.

The resultant object efficiently parses strings that match the template_string,
giving a list of substrings that correspond to each "directive" of the template.

Template directives:

  Wildcard:
    The template may be initialized with a wildcard that matches any string
    up to the string matching the next directive (which may not be a wild
    card or single character marker) or the next literal sequence of characters
    of the template.  The character that represents a wildcard is specified
    by the wild_card_marker parameter, which has no default.

    For example, using X as the wildcard:


    >>> T = Template("prefixXinteriorX", "X")
    >>> T.PARSE("prefix this is before interior and this is after")
    ([' this is before ', ' and this is after'], 47)
    >>> T = Template("<X>X<X>", "X")
    >>> T.PARSE('<A HREF="index.html">go to index</A>')
    (['A HREF="index.html"', 'go to index', '/A'], 36)

    Obviously the character used to represent the wildcard must be distinct
    from the characters used to represent literals or other directives.

  Fixed length character sequences:
    The template may have a marker character which indicates a fixed
    length field.  All adjacent instances of this marker will be matched
    by a substring of the same length in the parsed string.  For example:

      >>> T = Template("NNN-NN-NNNN", single_char_marker="N")
      >>> T.PARSE("1-2-34-5-12")
      (['1-2', '34', '5-12'], 11)
      >>> T.PARSE("111-22-3333")
      (['111', '22', '3333'], 11)
      >>> T.PARSE("1111-22-3333")
      ValueError: literal not found at (3, '-')

    A template may have multiple fixed length markers, which allows fixed
    length fields to be adjacent, but recognized separately.  For example:

      >>> T = Template("MMDDYYX", "X", "MDY")
      >>> T.PARSE("112489 Somebody's birthday!")
      (['11', '24', '89', " Somebody's birthday!"], 27)

  Regular expression markers:
    The template may have markers associated with regular expressions.
    the regular expressions may be either string represenations of compiled.
    For example:
      >>> T = Template("v: s i", v=id, s=str, i=int)
      >>> T.PARSE("this_is_an_identifier: 'a string' 12344")
      (['this_is_an_identifier', "'a string'", '12344'], 39)
      >>>

    Here id, str, and int are regular expression conveniences provided by
    this module.

  Directive markers may be mixed and matched, except that wildcards cannot precede
  wildcards or single character markers.
  Example:
>>> T = Template("ssnum: NNN-NN-NNNN, fn=X, ln=X, age=I, quote=Q", "X", "N", I=int, Q=str)
>>> T.PARSE("ssnum: 123-45-6789, fn=Aaron, ln=Watters, age=13, quote='do be do be do'")
(['123', '45', '6789', 'Aaron', 'Watters', '13', "'do be do be do'"], 72)
>>>

"""

import re, string
from types import StringType
from string import find

#
# template parsing
#
# EG: T = Template("(NNN)NNN-NNNN X X", "X", "N")
#     ([area, exch, ext, fn, ln], index) = T.PARSE("(908)949-2726 Aaron Watters")
#
class Template:

   def __init__(self,
                template,
                wild_card_marker=None,
                single_char_marker=None,
                **marker_to_regex_dict):
       self.template = template
       self.wild_card = wild_card_marker
       self.char = single_char_marker
       # determine the set of markers for this template
       markers = marker_to_regex_dict.keys()
       if wild_card_marker:
          markers.append(wild_card_marker)
       if single_char_marker:
          for ch in single_char_marker: # allow multiple scm's
              markers.append(ch)
          self.char = single_char_primary = single_char_marker[0]
       self.markers = markers
       for mark in markers:
           if len(mark)>1:
              raise ValueError, "Marks must be single characters: "+`mark`
       # compile the regular expressions if needed
       self.marker_dict = marker_dict = {}
       for (mark, rgex) in marker_to_regex_dict.items():
           if type(rgex) == StringType:
              rgex = re.compile(rgex)
           marker_dict[mark] = rgex
       # determine the parse sequence
       parse_seq = []
       # dummy last char
       lastchar = None
       index = 0
       last = len(template)
       # count the number of directives encountered
       ndirectives = 0
       while index<last:
          start = index
          thischar = template[index]
          # is it a wildcard?
          if thischar == wild_card_marker:
             if lastchar == wild_card_marker:
                raise ValueError, "two wild cards in sequence is not allowed"
             parse_seq.append( (wild_card_marker, None) )
             index = index+1
             ndirectives = ndirectives+1
          # is it a sequence of single character markers?
          elif single_char_marker and thischar in single_char_marker:
             if lastchar == wild_card_marker:
                raise ValueError, "wild card cannot precede single char marker"
             while index<last and template[index] == thischar:
                index = index+1
             parse_seq.append( (single_char_primary, index-start) )
             ndirectives = ndirectives+1
          # is it a literal sequence?
          elif not thischar in markers:
             while index<last and not template[index] in markers:
                index = index+1
             parse_seq.append( (None, template[start:index]) )
          # otherwise it must be a re marker
          else:
             rgex = marker_dict[thischar]
             parse_seq.append( (thischar, rgex) )
             ndirectives = ndirectives+1
             index = index+1
          lastchar = template[index-1]
       self.parse_seq = parse_seq
       self.ndirectives = ndirectives

   def PARSE(self, str, start=0):
       ndirectives = self.ndirectives
       wild_card = self.wild_card
       single_char = self.char
       parse_seq = self.parse_seq
       lparse_seq = len(parse_seq) - 1
       # make a list long enough for substitutions for directives
       result = [None] * ndirectives
       current_directive_index = 0
       currentindex = start
       # scan through the parse sequence, recognizing
       for parse_index in xrange(lparse_seq + 1):
           (indicator, data) = parse_seq[parse_index]
           # is it a literal indicator?
           if indicator is None:
              if find(str, data, currentindex) != currentindex:
                 raise ValueError, "literal not found at "+`(currentindex,data)`
              currentindex = currentindex + len(data)
           else:
              # anything else is a directive
              # is it a wildcard?
              if indicator == wild_card:
                 # if it is the last directive then it matches the rest of the string
                 if parse_index == lparse_seq:
                    last = len(str)
                 # otherwise must look at next directive to find end of wildcard
                 else:
                    # next directive must be re or literal
                    (nextindicator, nextdata) = parse_seq[parse_index+1]
                    if nextindicator is None:
                       # search for literal
                       last = find(str, nextdata, currentindex)
                       if last<currentindex:
                          raise ValueError, \
                           "couldn't terminate wild with lit "+`currentindex`
                    else:
                       # data is a re, search for it
                       last = nextdata.search(str, currentindex)
                       if last<currentindex:
                          raise ValueError, \
                           "couldn't terminate wild with re "+`currentindex`
              elif indicator == single_char:
                 # data is length to eat
                 last = currentindex + data
              else:
                 # other directives are always regular expressions
                 last = data.match(str, currentindex) + currentindex
                 if last<currentindex:
                    raise ValueError, "couldn't match re at "+`currentindex`
              #print "accepting", str[currentindex:last]
              result[current_directive_index] = str[currentindex:last]
              current_directive_index = current_directive_index+1
              currentindex = last
       # sanity check
       if current_directive_index != ndirectives:
          raise SystemError, "not enough directives found?"
       return (result, currentindex)

# some useful regular expressions
USERNAMEREGEX = \
  "["+string.letters+"]["+string.letters+string.digits+"_]*"
STRINGLITREGEX = "'[^\n']*'"
SIMPLEINTREGEX = "["+string.digits+"]+"
id = re.compile(USERNAMEREGEX)
str = re.compile(STRINGLITREGEX)
int = re.compile(SIMPLEINTREGEX)

def test():
    global T, T1, T2, T3

    T = Template("(NNN)NNN-NNNN X X", "X", "N")
    print T.PARSE("(908)949-2726 Aaron Watters")

    T1 = Template("s --> s blah", s=str)
    s = "' <-- a string --> ' --> 'blah blah another string blah' blah"
    print T1.PARSE(s)

    T2 = Template("s --> NNNiX", "X", "N", s=str, i=int)
    print T2.PARSE("'A STRING' --> 15964653alpha beta gamma")

    T3 = Template("XsXi", "X", "N", s=str, i=int)
    print T3.PARSE("prefix'string'interior1234junk not parsed")

    T4 = Template("MMDDYYX", "X", "MDY")
    print T4.PARSE("122961 Somebody's birthday!")


if __name__=="__main__": test()
www.java2java.com | Contact Us
All other trademarks are property of their respective owners.