#
# Python for parser interpretation
# Copyright Aaron Robert Watters, 1994
#
# BUGS:
# Lexical error handling is not nice
# Parse error handling is not nice
#
# Lex analysis may be slow for big grammars
# Setting case sensitivity for keywords MUST happen BEFORE
# declaration of keywords.
import kjSet
import string
import re
# set this flag for regression testing at each load
RUNTESTS = 0
# set this flag to enable warning for default reductions
WARNONDEFAULTS = 0
# some local constants
NOMATCHFLAG = -2 # FLAG FOR NO MATCH IN FSM
MOVETOFLAG = -3 # FLAG FOR "SHIFT" IN AN FSM
REDUCEFLAG = -4 # FLAG FOR REDUCTION IN FSM
TRANSFLAG = -5 # FLAG FOR TRANSIENT STATE IN FSM
KEYFLAG = -6 # FLAG FOR KEYWORD
NONTERMFLAG = -7 # FLAG FOR NONTERMINAL
TERMFLAG = -8 # FLAG FOR TERMINAL
EOFFLAG = "*" # FLAG for End of file
# set this string to the Module name (filename)
# used for dumping reconstructable objects
THISMODULE = "kjParser"
# regular expression for matching whitespace
WHITERE = "["+string.whitespace+"]+"
WHITEREGEX = re.compile(WHITERE)
# local errors
LexTokenError = "LexTokenError" # may happen on bad string
UnkTermError = "UnkTermError" # ditto
BadPunctError= "BadPunctError" # if try to make whitespace a punct
ParseInitError = "ParseInitError" # shouldn't happen?
#EOFError # may happen on bad string
FlowError = "FlowError" # shouldn't happen!!! (bug)
#SyntaxError # may happen on bad string
#TypeError
ReductError = "ReductError" # shouldn't happen?
NondetError = "NondetError" # shouldn't happen?
# the end of file is interpreted in the lexical stream as
# a terminal...
# this should be appended to the lexical stream:
ENDOFFILETOKEN = (TERMFLAG, EOFFLAG)
# in FSM use the following terminal to indicate eof
ENDOFFILETERM = (ENDOFFILETOKEN, EOFFLAG)
# utility function: emulate the old regex module's match semantics
# (return length of match, or -1 on no match) using a compiled re pattern
def RMATCH(regexpr, key, start=0):
#print "RMATCH: %s -> %s <- start=%s" % (regexpr.pattern, key, start)
group = regexpr.match(key, start)
if group is None:
#print "RMATCH: -1"
return -1
length = group.end() - group.start()
#print "RMATCH: %s (%s)" % (length, group.group())
return length
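# Illustrative example (names hypothetical): with
#   DIGITS = re.compile("[0-9]+")
# RMATCH(DIGITS, "123abc") returns 3 (the length of the match "123"),
# while RMATCH(DIGITS, "abc") returns -1 (no match at position 0).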
# utility function for error diagnostics
def DumpStringWindow(Str, Pos, Offset=15):
L = []
L.append("near ::")
start = Pos-Offset
end = Pos+Offset
if start<0: start = 0
if end>len(Str): end = len(Str)
L.append(`Str[start:Pos]`+"*"+`Str[Pos:end]`)
from string import join
return join(L, "\n")
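# Illustrative example: DumpStringWindow("select * from t", 7, 3)
# returns the two line string
#   near ::
#   'ct '*'* f'
# ie, a repr of the text surrounding position 7 with a "*" marking
# the position of interest.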
# lexical dictionary class
# this data structure is used by lexical parser below.
#
# basic operations:
# LD.punctuation(string)
# registers a string as a punctuation
# EG: LD.punctuation(":")
# Punctuations are treated as a special kind of keyword
# that is recognized even when not surrounded by whitespace.
# IE, "xend" will not be recognized as "x end", but "x;" will be
# recognized as "x ;" if "end" is a regular keyword but
# ";" is a punctuation. Only single character punctuations
# are supported (now), ie, ":=" must be recognized as
# ":" "=" above the lexical level.
#
# LD.comment(compiled_reg_expression)
# registers a comment pattern
# EG LD.comment(re.compile("--.*\n"))
# asks to recognize ansi/sql comments like "-- correct?\n"
#
# LD.keyword(keyword_string, canonicalstring)
# specifies a keyword string that should map to the canonicalstring
# when translated to the lexical stream.
# EG: LD.keyword("begin","BEGIN"); LD.keyword("BEGIN","BEGIN")
# will recognize upcase or downcase begins, not mixed case.
# (automatic upcasing is allowed below at parser level).
#
# LD[compiled_reg_expression] = (TerminalFlag, Function) # assignment!
# specifies a regular expression that should be associated
# with the lexical terminal marker TerminalFlag
# EG: LD[re.compile("[0-9]+")] = ("integer",string.atoi)
# the Function should be a function on one string argument
# that interprets the matching string as a value. if None is
# given, just the string itself will be used as the
# interpretation. (a better choice above would be a function
# which "tries" atoi first and uses atol on overflow).
# NOTE: ambiguity among regular expressions will be decided
# arbitrarily (fix?).
#
# LD[string] # retrieval!
# returns ((KEYFLAG, Keywordstring), Keywordstring)
# if the (entire) string matches a keyword or a
# punctuation Keywordstring.
# otherwise returns ((TERMFLAG, Terminalname), value)
# if the (entire) string matches the regular expression for
# a terminal flagged by Terminalname; value is the interpreted
# value. TerminalFlag better be something other than
# KEYFLAG!
# otherwise raises an error!
# comments not filtered here!
#
# the following additional functions are used for autodocumentation
# in declaring rules, etcetera.
# begin = LD.keyword("begin")
# sets variable "begin" to (KEYFLAG, "BEGIN") if
# "begin" maps to keyword "BEGIN" in LD
# integer = LD.terminal("integer")
# sets variable integer to ("integer", Function)
# if "integer" is a registered terminal Function is
# its associated interpretation function.
#
class LexDictionary:
def __init__(self):
# commentpatterns is simply a list of compiled regular expressions
# that represent comments
self.commentpatterns = []
# commentstrings is used for debugging/dumping/reconstruction etc.
self.commentstrings = []
# punctuationlist is a string of punctuations
self.punctuationlist = ""
# keywordmap is a dictionary mapping recognized keyword strings
# and punctuations to their constant representations.
self.keywordmap = KeywordDict()
# regexprlist is a list of triples (regex,Flag,function) mapping
# regular expressions to their flag and interpreter function.
self.regexprlist = []
def Dump(self):
print "comments = ", self.commentstrings
print "punctuations = ", self.punctuationlist
print "keywordmap ="
self.keywordmap.Dump()
print "regexprlist =", self.regexprlist
def __getitem__(self,key):
# try to match string to a keyword
try:
return self.keywordmap[key]
except KeyError:
# try to match a regular expression
found = 0 # so far not found
length = len(key)
for triple in self.regexprlist:
(regexpr, Flag, Function) = triple
index = RMATCH(regexpr,key)
if index == length:
found = 1
# use the function to interpret the string, if given
if Function != None:
value = Function(key)
else:
value = key
# NONLOCAL RETURN
return (Flag, value)
#endfor
raise LexTokenError, "no match for string: " + `key`
#enddef __getitem__
# LD.keyword("this") will make a new keyword "this" if not found
#
def keyword(self,str):
# upcase the string, if needed
if self.keywordmap.caseInsensitive:
str = string.upper(str)
if not self.keywordmap.has_key(str):
# cache the token to avoid excess construction during parsing
token = (KEYFLAG,str)
self.keywordmap[str] = (token,str)
else:
(token, str2) = self.keywordmap[str]
return token
# LD.terminal("this") will just look for "this"
# LD.terminal("this", RE, F) will register a new terminal
# RE must be a compiled regular expression or string reg ex
# F must be an interpretation function
#
def terminal(self, termname, RegExpr=None, Function=None):
if RegExpr != None and Function != None:
if type(RegExpr) == type(""):
RegExpr = re.compile(RegExpr)
self[ RegExpr ] = ( termname, Function)
for triple in self.regexprlist:
(regexpr, token, Function2) = triple
if token[1] == termname:
# nonlocal exit
return token
#endfor
# error if no exit by now
raise UnkTermError, "no such terminal"
def __setitem__(self,key,value):
if type(key) == type(''):
# if it's a string it must be a keyword
if self.keywordmap.caseInsensitive:
value = string.upper(value)
key = string.upper(key)
self.keywordmap[key] = ( (KEYFLAG, value), value)
else:
# otherwise it better be a compiled regular expression
# (not verified)
(Name, Function) = value
Flag = (TERMFLAG, Name)
regexpr = key
self.regexprlist = self.regexprlist + \
[ (regexpr, Flag, Function) ]
# register a regular expression as a comment
def comment(self, commentstr):
# commentstr better be an uncompiled string regular expression! (not verified)
regexpr = re.compile(commentstr)
self.commentpatterns = self.commentpatterns + [ regexpr ]
self.commentstrings = self.commentstrings + [ commentstr ]
# register a string as a punctuation
def punctuation(self,Instring):
if type(Instring) != type("") or len(Instring)!=1:
raise BadPunctError, "punctuation must be string of length 1"
if Instring in string.whitespace:
raise BadPunctError, "punctuation may not be whitespace"
self.punctuationlist = self.punctuationlist + Instring
return self.keyword(Instring)
# testing and altering case sensitivity behavior
def isCaseSensitive(self):
return not self.keywordmap.caseInsensitive
# setting case sensitivity MUST happen before keyword
# declarations!
def SetCaseSensitivity(self, Boolean):
self.keywordmap.caseInsensitive = not Boolean
# function to do same as __getitem__ above but looking _inside_ a string
# instead of at the whole string
# returns (token,skip)
# where token is one of
# ((KEYFLAG,name),name) or ((TERMFLAG,termname),value)
# and skip is the length of the substring of String that matches the token
def Token(self, String, StartPosition):
finished = 0 # dummy, exit should be nonlocal
totalOffset = 0
while not finished:
# flag EOF if past end of string?
if len(String) <= StartPosition:
return (ENDOFFILETERM, 0)
# skip whitespace
whitespacefound = 0
skip = RMATCH(WHITEREGEX,String, StartPosition)
if skip > 0:
StartPosition = StartPosition + skip
totalOffset = totalOffset + skip
whitespacefound = 1
# try to find comment, keyword, term in that order:
# looking for comment
commentfound = 0
for commentexpr in self.commentpatterns:
offset = RMATCH(commentexpr,String,StartPosition)
if offset != -1:
if offset<1:
info = DumpStringWindow(String,StartPosition)
raise LexTokenError, "zero length comment "+info
commentfound = 1
StartPosition = StartPosition + offset
totalOffset = totalOffset + offset
# looking for a keyword
keypair = self.keywordmap.hasPrefix(String,StartPosition,
self.punctuationlist)
if keypair != 0:
return ( keypair[0], keypair[1] + totalOffset)
# looking for terminal
for (regexpr, Flag, Function) in self.regexprlist:
offset = RMATCH(regexpr,String,StartPosition)
if offset != -1:
matchstring = String[StartPosition : offset+StartPosition]
if Function != None:
value = Function(matchstring)
else:
value = matchstring
return ((Flag, value) , offset + totalOffset)
if not (commentfound or whitespacefound):
info = DumpStringWindow(String,StartPosition)
raise LexTokenError, "Lexical parse failure "+info
#endwhile
#enddef
#endclass LexDictionary
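# A minimal usage sketch for the lexical dictionary (hedged: the names
# and sample input here are hypothetical, and note that LexDictionary
# is rebound to the experimental lexdictionary implementation below):
#
#   LD = LexDictionary()
#   LD.comment("--.*\n")                      # ansi/sql style comments
#   integer = LD.terminal("integer", "[0-9]+", string.atoi)
#   begin = LD.keyword("begin")
#   semi = LD.punctuation(";")
#   (token, skip) = LD.Token("begin 42;", 0)
#   # token is ((KEYFLAG, "begin"), "begin") and skip is 5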
# alternate, experimental implementation
class lexdictionary:
def __init__(self):
self.skip = ""
self.commentstrings = []
self.punctuationlist = ""
self.keywordmap = KeywordDict()
self.termlist = [] # list of (term, regex, flag, interpret_fn)
self.uncompiled = 1 # only compile after full initialization.
self.laststring= self.lastindex= self.lastresult = None
def Dump(self, *k):
raise "sorry", "not implemented"
__getitem__ = Dump
def keyword(self, str):
kwm = self.keywordmap
if kwm.caseInsensitive:
str = string.upper(str)
try:
(token, str2) = kwm[str]
except:
token = (KEYFLAG, str)
self.keywordmap[str] = (token,str)
return token
def terminal(self, str, regexstr=None, Function=None):
if regexstr is not None:
flag = (TERMFLAG, str)
self.termlist.append( (str, regexstr, flag, Function) )
return flag
else:
for (s, rgex, fl, fn) in self.termlist: # termlist entries are 4-tuples
if fl[1]==str:
return fl
else:
raise UnkTermError, "no such terminal"
__setitem__ = Dump
def comment(self, str):
self.commentstrings.append(str)
def punctuation(self, Instring):
if type(Instring) != type("") or len(Instring)!=1:
raise BadPunctError, "punctuation must be string of length 1"
if Instring in string.whitespace:
raise BadPunctError, "punctuation may not be whitespace"
self.punctuationlist = self.punctuationlist + Instring
return self.keyword(Instring)
def SetCaseSensitivity(self, Boolean):
self.keywordmap.caseInsensitive = not Boolean
def Token(self, String, StartPosition):
# shortcut for reductions.
if self.laststring is String and self.lastindex == StartPosition:
#print "lastresult", self.lastresult
return self.lastresult
self.lastindex = StartPosition
self.laststring = String
#print `String[StartPosition: StartPosition+60]`
if self.uncompiled:
self.compile()
self.uncompiled = None
finished = 0
totalOffset = 0
skipprog = self.skipprog
keypairfn = self.keywordmap.hasPrefix
punctlist = self.punctuationlist
termregex = self.termregex
while not finished:
if len(String) <= StartPosition:
result = self.lastresult = (ENDOFFILETERM, 0)
return result
# skip ws and comments
#skip = skipprog.match(String, StartPosition)
skip = RMATCH(skipprog, String, StartPosition)
if skip > 0:
StartPosition = StartPosition + skip
totalOffset = totalOffset + skip
continue
# (a zero length skip match falls through to token matching below)
# look for keyword
keypair = keypairfn(String, StartPosition, punctlist)
if keypair!=0:
#print "keyword", keypair
result = self.lastresult = (keypair[0], keypair[1]+totalOffset)
return result
# look for terminal
#print "Termregex: %s --> %s <-- start=%s" % (termregex.pattern, String, StartPosition)
offset = termregex.match(String, StartPosition)
if offset is not None:
g = offset.group
for (term, regex, flag, fn) in self.termlist:
test = g(term)
if test:
#print "terminal", test
if fn is not None:
value = fn(test)
else:
value = test
result = self.lastresult = (
(flag, value), offset.end() - offset.start() + totalOffset)
return result
# error if we get here
info = DumpStringWindow(String, StartPosition)
raise LexTokenError, "Lexical token not found "+info
def isCaseSensitive(self):
return not self.keywordmap.caseInsensitive
def compile(self):
from string import joinfields
skipregexen = self.commentstrings + [WHITERE]
skipregex = "(" + joinfields(skipregexen, ")|(") + ")"
#print skipregex; import sys; sys.exit(1)
self.skipprog = re.compile(skipregex)
termregexen = []
termnames = []
for (term, rgex, flag, fn) in self.termlist:
fragment = "(?P<%s>%s)" % (term, rgex)
termregexen.append(fragment)
termnames.append(term)
termregex = joinfields(termregexen, "|")
self.termregex = re.compile(termregex)
self.termnames = termnames
LexDictionary = lexdictionary # NOTE: rebinds LexDictionary to the experimental implementation
# a utility class: dictionary of prefixes
# should be generalized to allow upcasing of keyword matches
class KeywordDict:
def __init__(self, caseInsensitive = 0):
self.FirstcharDict = {}
self.KeyDict = {}
self.caseInsensitive = caseInsensitive
def Dump(self):
if self.caseInsensitive:
print " case insensitive"
else:
print " case sensitive"
keys = self.KeyDict.keys()
print " keyDict has ", len(keys), " elts"
for key in keys:
print " ", key," maps to ",self.KeyDict[key]
firstchars = self.FirstcharDict.keys()
print " firstcharDict has ", len(firstchars), " elts"
for char in firstchars:
print " ", char," maps to ",self.FirstcharDict[char]
# set item assumes value has correct case already, if case sensitive
def __setitem__(self, key, value):
if len(key)<1:
raise LexTokenError, "Keyword of length 0"
if self.caseInsensitive:
KEY = string.upper(key)
else:
KEY = key
firstchar = KEY[0:1]
if self.FirstcharDict.has_key(firstchar):
self.FirstcharDict[firstchar] = \
self.FirstcharDict[firstchar] + [(KEY, value)]
else:
self.FirstcharDict[firstchar] = [(KEY, value)]
self.KeyDict[KEY] = value
# if String has a registered keyword at start position
# return its canonical representation and offset, else 0
# keywords that are not punctuations should be
# recognized only if followed
# by a punctuation or whitespace char
#
def hasPrefix(self,String,StartPosition,punctuationlist):
First = String[StartPosition:StartPosition+1]
fcd = self.FirstcharDict
caseins = self.caseInsensitive
if caseins:
First = string.upper(First)
if fcd.has_key(First):
Keylist = fcd[First]
else:
return 0
for (key,value) in Keylist:
offset = len(key)
EndPosition = StartPosition+offset
match = String[StartPosition : EndPosition]
if caseins:
match = string.upper(match)
if key == match:
if len(key)==1 and key in punctuationlist:
# punctuations are recognized regardless of nextchar
return (value,offset)
else:
# nonpuncts must have punct or whitespace following
#(uses punct as single char convention)
if EndPosition == len(String):
return (value, offset)
else:
nextchar = String[EndPosition]
if nextchar in string.whitespace\
or nextchar in punctuationlist:
return (value, offset)
return 0 # if no exit inside for loop, fail
def __getitem__(self,key):
if self.caseInsensitive:
key = string.upper(key)
return self.KeyDict[key]
def has_key(self,key):
if self.caseInsensitive:
key = string.upper(key)
return self.KeyDict.has_key(key)
#endclass KeywordDict:
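# Illustrative sketch of prefix matching (hypothetical entries):
#
#   KD = KeywordDict()
#   KD["end"] = ((KEYFLAG, "end"), "end")
#   KD.hasPrefix("end;", 0, ";")  # ";" is a punctuation: returns (value, 3)
#   KD.hasPrefix("endx", 0, ";")  # "end" not followed by whitespace
#                                 # or punctuation: returns 0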
# LexStringWalker walks through a string looking for
# substrings recognized by a lexical dictionary
#
# ERROR REPORTING NEEDS IMPROVEMENT
class LexStringWalker:
def __init__(self, String, LexDict):
self.Position = 0
self.NextPosition = 0
self.String = String
self.LexDict = LexDict
self.PastEOF = 0
self.Done = 0
def DUMP(self):
return DumpStringWindow(self.String,self.Position)
#reset not defined
def more(self):
return not self.PastEOF
def getmember(self):
(Token,skip) = self.LexDict.Token(self.String, self.Position)
self.NextPosition = self.Position + skip
if Token == ENDOFFILETERM:
self.PastEOF = 1
return Token
def next(self):
if self.Done:
data = self.DUMP()
raise LexTokenError, "no next past end of file "+data
elif self.PastEOF:
self.Done=1
elif self.NextPosition > self.Position:
self.Position = self.NextPosition
else:
dummy = self.getmember()
if self.NextPosition <= self.Position:
data = self.DUMP()
raise LexTokenError, "Lexical walker not advancing "+data
self.Position = self.NextPosition
#endclass LexStringWalker
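# Typical traversal sketch (LD is a hypothetical, fully initialized
# lexical dictionary as above):
#
#   walker = LexStringWalker("begin 42;", LD)
#   while walker.more():
#       token = walker.getmember()  # current token
#       walker.next()               # advance past it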
# the parse class:
# Based loosely on Aho and Ullman, Principles of Compiler Design, Ch. 6,
# except that they don't describe how to handle boundary
# conditions, so I made those up myself.
#
# Note: This could be implemented using just functions; it's implemented
# as a class to facilitate diagnostics and debugging in case of
# failures of various sorts.
#
# a parse accepts
# a rule list
#
# a lexically analysed stream with methods
# stream.getmember() returns the current token on the stream
# stream.next() moves on to next token
# stream.more() returns false if current token is the last token
#
# and a FSM (finite state machine) with methods
# FSM.root_nonTerminal
# the nonterminal at which to start parsing
# FSM.initial_state
# the initial state to start at
# FSM.successful_final_state
# the final state to go to upon successful parse
# FSM.map(Current_State,Current_Token)
# returns either
# (TERMFLAG, 0)
# if Current_State is terminal (final or reduction).
# (NOMATCHFLAG, 0)
# if Current_State is nonterminal, but the Current_Token
# and Next_Token do not lead to a valid state in the FSM
# (MOVETOFLAG, Next_State)
# if Current_State is nonterminal and Current_Token,
# Next_token map to Next_State from Current_State.
# (REDUCEFLAG, Rulenum)
# if Current_State indicates a reduction at Current_Token
# for the rule with index Rulenum
#
# and a Stack with methods (implemented as a plain Python list)
# (init: [] )
# Stack.Top() returns top of stack (no pop)
# ( Stack[-1] )
# Stack.Push(Object)
# ( Stack.append(Object) )
# Stack.MakeEmpty()
# ( Stack[:] = [] )
# Stack.IsEmpty()
# ( len(Stack) == 0 )
# Stack.Pop()
# ( del Stack[-1] )
# stack contents created by Parser will be of form (State,Value)
# where Value was inserted at FSM state State.
# Value of form either (KEYFLAG, Name)
# (NontermName, reductionvalue)
# or (TerminalName, value)
#
# and an optional parameter Evaluate which if nonzero (the default)
# indicates that rules should be evaluated after reduction;
# otherwise rules should just be reduced and the reduction
# structure should be used as the result of the rule
#
# rule objects must support methods
# Rule.reduce(Stack)
# pops off the elements corresponding to the body of the Rule
# from the stack and returns (NewStack,Red) where NewStack is
# the stack minus the body and Red is the result of evaluating the
# reduction function on this instance of the rule.
# Rule.Nonterm
# the nonterminal at the head of the rule
class ParserObj:
# Evaluate determines whether rules should be evaluated
# after reductions. Context is an argument passed to the
# list reduction function
#
def __init__(self, Rulelist, Stream, FSM, Stack, \
Evaluate=1, \
Context=None):
self.Rules = Rulelist
self.LexStream = Stream
self.FSM = FSM
self.Stack = Stack
self.Context = Context
# start with empty stack, initial_state, no nonterminal
#self.Stack[-1] = 0# self.Stack.MakeEmpty()
self.Stack[:] = []
self.State = FSM.initial_state
self.currentNonterm = None
self.Evaluate = Evaluate
# DoOneReduction accepts tokens from the stream and pushes
# them onto the stack until a reduction state is reached.
#
# Resolve the reduction
#
def DoOneReduction(self):
current=self.State
FSM=self.FSM
Stack = self.Stack
Context = self.Context
Stream = self.LexStream
# the internal FSM.StateTokenMap dictionary is used directly here.
STMap = FSM.StateTokenMap
#if FSM.final_state(current):
# raise ParseInitError, 'trying to reduce starting at final state'
tokenVal = Stream.getmember()
#print "tokenVal", tokenVal
token = tokenVal[0]
# push the token and traverse FSM until terminal state is reached
#(flag, nextThing) = FSM.map(current, token)
key = (current, token)
try:
(flag, nextThing) = STMap[key][0]
except KeyError:
flag = NOMATCHFLAG
while flag == MOVETOFLAG:
nextState = nextThing
#print current, " shift ", token,
# no sanity check, possible infinite loop
# push current token and next state
ThingToPush = (nextState, tokenVal)
#print "pushing ", ThingToPush
#Stack[-1]=Stack[-1]+1; Stack[Stack[-1]]=ThingToPush
Stack.append(ThingToPush)
#Stack.Push( ThingToPush )
# move to next token, next state
Stream.next()
# error if end of stream
if not Stream.more(): # optimized Stream.PastEOF (?)
data = Stream.DUMP()
raise EOFError, 'end of stream during parse '+data
current = nextState
tokenVal = Stream.getmember()
token = tokenVal[0]
#MAP = FSM.map(current,token)
key = (current, token)
try:
(flag, nextThing) = STMap[key][0]
except KeyError:
flag = NOMATCHFLAG
# at end of while loop we should be at a reduction state
if flag == REDUCEFLAG:
rulenum = nextThing
#print current, " reduce ", token, self.Rules[rulenum]
# normal case
# perform reduction
rule = self.Rules[rulenum]
Nonterm = rule.Nonterm
self.currentNonterm = Nonterm
(Stack, reduct) = rule.reduce( Stack , Context )
# self.Stack = Stack #not needed, unless stack rep changes
GotoState = self.GotoState(rule)
# push the Gotostate and result of rule reduction on stack
ThingToPush = (GotoState, (Nonterm, reduct) )
# push the result of the reduction and exit normally
#print "pushing ", ThingToPush
#Stack[-1]=Stack[-1]+1; Stack[Stack[-1]]=ThingToPush
Stack.append(ThingToPush)
#Stack.Push(ThingToPush)
self.State=GotoState
return 1 # normal successful completion
# some error cases
elif flag == NOMATCHFLAG:
self.ParseError(current,tokenVal, "nomatch1")
#elif FSM.final_state(current):
# raise BadFinalError, 'unexpected final state reached in reduction'
else:
data = Stream.DUMP()
s = """
flag = %s
map = %s """ % (flag, FSM.map(current,token))
data = data + s
raise FlowError, 'unexpected else '+data
#enddef DoOneReduction
# compute the state to goto after a reduction is performed
# on a rule.
# Algorithm: determine the state at beginning of reduction
# and the next state indicated by the head nonterminal of the rule.
# special case: empty stack and root nonterminal -> success.
#
def GotoState(self, rule):
FSM = self.FSM
Stack = self.Stack
Head = rule.Nonterm
if len(Stack)==0: #Stack[-1]==0: #Stack.IsEmpty():
BeforeState = FSM.initial_state
else:
BeforeState = Stack[-1][0] #Stack[Stack[-1]][0] #Stack.Top()[0]
# is this right? if the stack is empty and the Head
# is the root nonterm, then goto is final state
if len(Stack)==0 and Head == FSM.root_nonTerminal:#Stack.isEmpty()
Result = FSM.successful_final_state
else:
# consider eliminating the call to .map here? (efficiency)
(flag, Result) = FSM.map(BeforeState, Head)
if flag != MOVETOFLAG:
#FSM.DUMP()
self.ParseError(BeforeState, Head, "notmoveto")
return Result
def ParseError( self, State, Token, *rest):
# make this parse error nicer (add diagnostic methods?)
L = [""]
L.append("*******************************")
L.append("current state = "+`State`)
L.append("expects: ")
expects = ""
for (flag,name) in self.FSM.Expects(State):
if flag in (TERMFLAG, KEYFLAG):
expects = expects + `name`+ ", "
L.append(expects)
L.append(`rest`)
L.append("current token = " + `Token`)
#print "Stack =",
#self.StackDump(5)
#print
from string import join
data = self.LexStream.DUMP() + join(L, "\n")
raise SyntaxError, 'unexpected token sequence.' + data
def StackDump(self, N):
# print (up to) the top N entries of the list based stack
Stack = self.Stack
Topkey = len(Stack)
if Topkey > N:
Start = Topkey - N
else:
Start = 0
for i in range(Start, Topkey):
print " :: ", Stack[i],
# execute parsing until done:
def GO(self):
while self.State != self.FSM.successful_final_state:
#self.FSM.final_state(self.State):
self.DoOneReduction()
# should I check that stack has only one elt here?
# return result of last reduction
return self.Stack[-1][1] #self.Stack.Top()[1]
#endclass ParserObj
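# A hedged driver sketch (RULES, STREAM, and FSM stand for objects
# built elsewhere; Grammar.DoParse below is the real entry point):
#
#   p = ParserObj(RULES, STREAM, FSM, [], 1, None)
#   result = p.GO()  # the (root nonterminal, reduction value) pair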
# function for declaring a variable to represent a nonterminal:
# eg Program = nonterminal("program")
# included for convenient autodocumentation
#
def nonterminal(string):
return (NONTERMFLAG, string)
# declaring a terminal WITHOUT INSTALLING IT IN A LexDict
def termrep(string):
return (TERMFLAG, string)
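# EG: Program = nonterminal("program") binds Program to
# (NONTERMFLAG, "program"), and intrep = termrep("integer") binds
# intrep to (TERMFLAG, "integer") without registering anything
# in a lexical dictionary.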
# the rule class
# a rule is defined by a goal nonterminal marker of form
# (NONTERMFLAG, Name)
# and a list defining the body which must contain elts of form
# (KEYFLAG, Name) or (NONTERMFLAG, Name) or (TERMFLAG, Name)
# and a reduction function which takes a list of the same size
# as the BodyList (consisting of the results of the evaluations of
# the previous reductions)
# and returns an interpretation for the body
# the following function is used as a default reduction function
# for rules
def DefaultReductFun( RuleResultsList, Context ):
if WARNONDEFAULTS:
print "warn: default reduction."
print " ", RuleResultsList
return RuleResultsList
class ParseRule:
def __init__(self, goalNonTerm, BodyList, \
ReductFunction = DefaultReductFun):
#print BodyList
# check some of the arguments (very limited!)
if len(goalNonTerm) != 2 or goalNonTerm[0] != NONTERMFLAG:
raise TypeError, "goal of rule must be nonterminal"
for m in BodyList:
#print m
if len(m) != 2:
raise TypeError, "invalid body form for rule"
self.Nonterm = goalNonTerm
self.Body = BodyList
self.ReductFun = ReductFunction
# for dumping/reconstruction: LOSES THE INTERPRETATION FUNCTION!
def __repr__(self):
return THISMODULE + ".ParseRule" + `self.components()`
# marshal-able components of a rule
def components(self):
return (self.Nonterm, self.Body)
# rule.reduce(Stack) pops off the stack elements corresponding
# to the body of the rule and prepares the appropriate reduction
# object for evaluation (or not) at higher levels
#
def reduce(self, Stack, Context=None):
#print "reducing", Stack
Blength = len(self.Body)
#print Blength, len(self.Body)
# pop off previous results from stack corresponding to body
BodyResults = [None] * Blength
#BodyNames = [None] * Blength # for debug
#print "popping: "
for i in range(1,Blength+1):
Bindex = Blength - i # stack contents pop off in reverse order
# get and destructure the rule body entry
RuleEntry = self.Body[Bindex]
( REkind , REname ) = RuleEntry
# get and destructure the stack entry
PoppedValue = Stack[-i] #Stack.Top()
#print PoppedValue,
#del Stack[-1]# = Stack[-1]-1 #Stack.Pop()
SETokVal = PoppedValue[1]
SEvalue = SETokVal[1]
SEname = SETokVal[0][1]
# the names from rule and stack must match (?)
if SEname != REname:
print SEname, REname
print self
raise ReductError, " token names don't match"
# store the values for the reduction
BodyResults[Bindex] = SEvalue
#BodyNames[Bindex] = SEname # debug
#endfor
del Stack[len(Stack)-Blength:]
#print "reduced", Stack
#print
# evaluate the reduction, in context
reduct = self.ReductFun(BodyResults, Context)
if WARNONDEFAULTS and self.ReductFun is DefaultReductFun:
# should check whether name is defined before this...
print " default used on ", self.Name
#Reduction( self.ReductFun, BodyResults, BodyNames )
return (Stack, reduct)
#enddef ParseRule.reduce
#endclass ParseRule
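# A hedged construction sketch (all names hypothetical; normally rules
# are generated by the grammar builder rather than written by hand):
#
#   Value = nonterminal("value")
#   intrep = termrep("integer")
#   def Negate(list, Context):
#       return -list[1]
#   r = ParseRule(Value, [ (KEYFLAG, "-"), intrep ], Negate)
#
# reducing an instance of "- 5" pops the two body entries off the
# parse stack and produces (Stack, -5).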
# for debugging: look through a rule list
# and print names of rules that have default binding
#
def PrintDefaultBindings(rulelist):
for r in rulelist:
if r.ReductFun is DefaultReductFun:
print r.Name
# the FSM class
#
class FSMachine:
def __init__(self, rootNonTerm):
# start and success state conventions
startState=1
successState=0
self.root_nonTerminal = rootNonTerm
self.initial_state = startState
self.successful_final_state = successState
# the list of states of the FSM, implemented as a dictionary
# entries are identified by their index
# content is
# a list whose first elt is either TRANSFLAG, or TERMFLAG
# other list elts may be added by other layers (parse generator)
# indicating the kind of the state.
self.States = {}
# allocate start and success states
self.States[startState]=[TRANSFLAG]
self.States[successState]=[TERMFLAG]
# the most recently allocated state
self.maxState= startState
# the map of current state+token to next state,
# with entries of form (state, token): nextstate_sequence
#
self.StateTokenMap = {}
#enddef FSM()
# ForbiddenMark is for filtering out maps to an error state
def DUMP(self, DumpMapData=1, DumpStateData=1, ForbiddenMark={}):
print "root nonterminal is ", self.root_nonTerminal
print "start at ", self.initial_state
print "end at ", self.successful_final_state
print "number of states: ", self.maxState
if DumpStateData:
print
for State in range(0,self.maxState+1):
Data = self.States[State]
print State, ": ", Data
if DumpMapData:
print
for key in self.StateTokenMap.keys():
map = self.StateTokenMap[key]
if map[0][0] == MOVETOFLAG:
ToStateData = self.States[map[0][1]]
if len(ToStateData) < 2:
Mark = None
else:
Mark = ToStateData[1]
if Mark != ForbiddenMark:
print key, " > ", map, " = ", ToStateData
else:
print key, " > reduction to rule number ", map[0][1]
# what tokens does a state expect?
def Expects(self, State):
keys = self.StateTokenMap.keys()
Tokens = kjSet.NewSet( [] )
for (state1,token) in keys:
if State == state1:
kjSet.addMember(token,Tokens)
return kjSet.get_elts(Tokens)
# "allocate" a new state of specified kind
# kind must be TRANSFLAG, TERMFLAG or REDUCEFLAG
# returns the number of the new state
def NewState(self, kind, AdditionalInfo = []):
if not kind in (TRANSFLAG,TERMFLAG,REDUCEFLAG):
raise TypeError, "unknown state kind"
available = self.maxState+1
self.States[available] = [kind] + AdditionalInfo
self.maxState = available
return available
# Install a reduction transition in the FSM:
# a reduction is represented by mapping to a rule index
# no nondeterminism is allowed.
def SetReduction(self, fromState, TokenRep, Rulenum):
key = (fromState, TokenRep)
if not self.StateTokenMap.has_key(key):
self.StateTokenMap[ key ] = ((REDUCEFLAG, Rulenum),)
else:
raise ReductError, "attempt to set ambiguous reduction"
# Install a "shift" or "goto transition in the FSM:
# supports nondeterminism by storing a sequence of possible transitions
#
def SetMap(self, fromState, TokenRep, toState):
key = (fromState, TokenRep)
if self.StateTokenMap.has_key(key):
Old = self.StateTokenMap[key]
if Old[0][0] != MOVETOFLAG:
# if the existing entry is not a "shift" transition,
# the map would mix shifts and reductions: complain:
raise NondetError, \
"attempt to make inappropriate transition ambiguous"
self.StateTokenMap[ key ] = Old + ((MOVETOFLAG,toState),)
else:
self.StateTokenMap[ key ] = ((MOVETOFLAG,toState),)
# Find the action indicated by fsm on
# (current_state, current_token) input.
#
# note: in the event of nondeterministic choice this chooses
# the first possibility listed.
# ParseObj.DoOneReduction() currently uses the internal structure
# of StateTokenMap directly, rather than using this function.
#
def map(self, current_state, current_token):
StateEntry = self.States[current_state][0]
if StateEntry == TERMFLAG:
return (TERMFLAG, 0)
elif StateEntry == TRANSFLAG:
# try to find a transition for this token and state
key = (current_state, current_token)
try:
TMap = self.StateTokenMap[key]
#print "TMap ", TMap
#print "key ", key
#print
return TMap[0]
except KeyError:
return (NOMATCHFLAG, 0)
else:
raise FlowError, "unexpected else (2)"
#enddef map
#endclass FSMachine
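# A tiny illustrative FSM (hypothetical tokens; real machines are
# generated by the parser generator):
#
#   fsm = FSMachine(nonterminal("goal"))
#   s2 = fsm.NewState(TRANSFLAG)        # s2 == 2
#   fsm.SetMap(fsm.initial_state, (TERMFLAG, "integer"), s2)
#   fsm.SetReduction(s2, ENDOFFILETOKEN, 0)
#   fsm.map(1, (TERMFLAG, "integer"))   # gives (MOVETOFLAG, 2)
#   fsm.map(2, ENDOFFILETOKEN)          # gives (REDUCEFLAG, 0)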
# the grammar class:
# a grammar consists of
# - a LexDict lexical dictionary;
# - a deterministic FSMachine;
# - a Rulelist
# and optionally a dictionary that maps Rulenames
# to Rulelist indices (used for dumping and externally)
#
class Grammar:
def __init__(self, LexD, DFA, RuleL, RuleNameDict = None):
# for auto initialization set LexD,DFA,RuleL to None
if LexD == None and DFA == None and RuleL == None:
self.LexD = LexDictionary()
# use a dummy root nonterminal -- must fix elsewhere!
self.DFA = FSMachine("ERROR")
self.RuleL = []
else:
self.LexD = LexD
self.DFA = DFA
self.RuleL = RuleL
if RuleNameDict != None:
self.AddNameDict(RuleNameDict)
self.CleanUp()
#enddef __init__
# look for default bindings
def PrintDefaults(self):
print "Default bindings on:"
PrintDefaultBindings(self.RuleL)
# setting case sensitivity: must happen before keyword installation
# in LexD.
def SetCaseSensitivity( self, Boolean ):
self.LexD.SetCaseSensitivity( Boolean )
# this may be silly, but to save some space in construction
# a token dictionary may be used that facilitates sharing of
# token representations. This method either initializes
# the dictionary or disposes of it if it exists
def CleanUp(self):
self.IndexToToken = {}
# this dictionary is used by automatically
# generated grammars to determine whether
# a string represents a nonterminal
self.NonTermDict = {}
# similarly for terminals
self.TermDict = {}
# this string may be used to keep a printable
# representation of the rules of the grammar
# (usually in automatic grammar generation)
self.RuleString = ""
# to associate a token to an integer use
# self.IndexToToken[int] = tokenrep
# this method associates rules to names using a
# RuleNameDict dictionary which maps names to rule indices.
# after invocation
# self.RuleNameToIndex[ name ] gives the index
# in self.RuleL for the rule associated with name, and
# self.RuleL[index].Name gives the name associated
# with the rule self.RuleL[index]
#
def AddNameDict(self, RuleNameDict):
self.RuleNameToIndex = RuleNameDict
# add a Name attribute to the rules of the rule list
for ruleName in RuleNameDict.keys():
index = RuleNameDict[ ruleName ]
self.RuleL[ index ].Name = ruleName
# parse a string using the grammar, return result and context
def DoParse( self, String, Context = None, DoReductions = 1 ):
# construct the ParserObj
Stream = LexStringWalker( String, self.LexD )
Stack = [] # {-1:0} #Walkers.SimpleStack()
ParseOb = ParserObj( self.RuleL, Stream, self.DFA, Stack, \
DoReductions, Context )
# do the parse
ParseResult = ParseOb.GO()
# return final result of reduction and the context
return (ParseResult[1], Context)
#enddef DoParse
# parse a string using the grammar, but only return
# the result of the last reduction, without the context
def DoParse1( self, String, Context=None, DoReductions=1 ):
return self.DoParse(String, Context, DoReductions)[0]
# if the Name dictionary has been initialized
# this method will (re)bind a reduction function to
# a rule associated with Rulename
#
def Bind( self, Rulename, NewFunction ):
ruleindex = self.RuleNameToIndex[ Rulename ]
rule = self.RuleL[ ruleindex ]
rule.ReductFun = NewFunction
#enddef Bind
# bind a terminal to a regular expression and interp function
# in the lexical dictionary (convenience)
def Addterm( self, termname, regexpstr, funct ):
self.TermDict[ termname ] =\
self.LexD.terminal( termname, regexpstr, funct )
#endclass Grammar
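# Typical use of a fully built Grammar (a hedged sketch: GRAMMAR and the
# rule/terminal names stand for objects produced elsewhere, eg by the
# grammar building module):
#
#   GRAMMAR.Addterm("integer", "[0-9]+", string.atoi)
#   GRAMMAR.Bind("rulename", MyReductionFunction)
#   (result, context) = GRAMMAR.DoParse("some input", Context={})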
# function to create a "null grammar"
def NullGrammar():
return Grammar(None,None,None,{})
# unmarshalling a marshalled grammar created by
# buildmodule.CGrammar.MarshalDump(Tofile)
# tightly coupled with buildmodule code...
# file should be open and "pointing to" the marshalled rep.
#
# warning: doesn't bind semantics!
#
def UnMarshalGram(file):
Grammar = NullGrammar()
UnMarshal = UnMarshaller(file, Grammar)
UnMarshal.MakeLex()
UnMarshal.MakeRules()
UnMarshal.MakeTransitions()
UnMarshal.Cleanup()
return UnMarshal.Gram
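# Usage sketch (the file name is hypothetical):
#
#   f = open("grammar.mar", "rb")
#   GRAMMAR = UnMarshalGram(f)
#   f.close()
#   # semantics are not bound; rebind with GRAMMAR.Bind(name, fn) as needed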
# unmarshalling object for unmarshalling grammar from a file
#
class UnMarshaller:
def __init__(self, file, Grammar):
import marshal
self.Gram = Grammar
BigList = marshal.load(file)
if type(BigList) != type([]):
raise FlowError, "bad type for unmarshalled list"
if len(BigList) != 9:
raise FlowError, "unmarshalled list of wrong size"
self.tokens = BigList[0]
self.punct = BigList[1]
self.comments = BigList[2]
self.RuleTups = BigList[3]
self.MaxStates = BigList[4]
self.reducts = BigList[5]
self.moveTos = BigList[6]
self.Root = BigList[7]
self.CaseSensitivity = BigList[8]
Grammar.SetCaseSensitivity( self.CaseSensitivity )
def MakeLex(self):
Grammar=self.Gram
LexD = Grammar.LexD
# punctuations
LexD.punctuationlist = self.punct
# comments
for commentregex in self.comments:
LexD.comment(commentregex)
#LexD.commentstring = self.comments
# keywords, terminals, nonterms
# rewrite the tokens list for sharing and extra safety
LexTokens = {}
tokens = self.tokens
for tokenindex in range(len(tokens)):
(kind,name) = tokens[tokenindex]
if kind == KEYFLAG:
tokens[tokenindex] = LexD.keyword(name)
elif not kind in [TERMFLAG, NONTERMFLAG]:
raise FlowError, "unknown token type"
# not needed
self.tokens = tokens
def MakeRules(self):
Grammar = self.Gram
Grammar.DFA.root_nonTerminal = self.Root
NameIndex = Grammar.RuleNameToIndex
RuleTuples = self.RuleTups
nRules = len(RuleTuples)
RuleList = [None] * nRules
for index in range(nRules):
(Name, Components) = RuleTuples[index]
rule = apply(ParseRule, Components)
rule.Name = Name
RuleList[index] = rule
NameIndex[Name] = index
Grammar.RuleL = RuleList
def MakeTransitions(self):
Grammar = self.Gram
DFA = Grammar.DFA
StateTokenMap = DFA.StateTokenMap
tokens = self.tokens
# record the state number
DFA.maxState = self.MaxStates
# this is historical, unfortunately... CLEAN IT UP SOMEDAY!
# THE DFA.States DICT IS NOT NEEDED (?) (here)
for state in range(1, self.MaxStates+1):
DFA.States[state] = [TRANSFLAG]
# record the reductions
for (fromState, TokenIndex, rulenum) in self.reducts:
DFA.SetReduction(fromState, tokens[TokenIndex], rulenum)
# record the transitions
for (fromState, TokenIndex, ToState) in self.moveTos:
DFA.SetMap(fromState, tokens[TokenIndex], ToState)
def Cleanup(self):
Grammar = self.Gram
Grammar.CleanUp()
################# FOLLOWING CODE IS FOR REGRESSION TESTING ONLY
################# DELETE IT IF YOU WANT/NEED
#### All tests for this module deleted, since
#### ParseBuild module tests are sufficient.