Package td_parser :: Module td_parser
[hide private]
[frames] | no frames]

Source Code for Module td_parser.td_parser

  1  # Ling 581: Top Down Parser 
  2  # 
  3  # Copyright (C) 2006 San Diego State University 
  4  # Author: Mark Gawron <gawron@mail.sdsu.edu> 
  5   
6 -class Parser:
7 """ 8 Class for top down recursive descent parser. 9 """ 10
11 - def __init__(self, grammar, string=''):
12 """ 13 A parser instance must have a grammar at instance creation time. 14 The terminals, productions, and start_cat for the grammar 15 are stored upon creation. 16 17 If a string is supplied it is converted to a list words 18 and stored in self.input. A string may also be supplied 19 when the recognizer- or parser- method is called. 20 """ 21 self.productions = grammar.productions 22 self.start_cat = grammar.start_cat 23 if not grammar.terminals: 24 grammar.compute_terminals() 25 self.terminals = grammar.terminals 26 self.input = self.string_to_wordlist(string) 27 self.recursion_limit=100 28 self.recursion_depth=0
29 30 ## Main body of recognizer code starts here! 31
32 - def recognize_string(self, string=''):
33 """ 34 @param string: A string. The string to be parsed. 35 Immediately converted to a [WordList].by self.legal_input. 36 37 @return: The result of calling recurse_recognize on the start-cat, 38 the empty agenda, and the wordlist generated from string. 39 """ 40 if self.legal_input(string): 41 self.recursion_depth = 0 42 return self.recurse_recognize([self.start_cat],[],self.input) 43 else: 44 return False
45
46 - def recurse_recognize(self,goals,agenda,wordlist):
47 """ 48 Parse wordlist using C{goals} (derivation). Return true 49 if current grammar accepts C{wordlist}. Else False. 50 51 Return True whenever C{goals} and C{wordlist} are empty. 52 53 Suppose C{goals} is non-empty: 54 55 If C{goals}[0] is a nonterminal, call C{expand} (expand it with the grammar 56 and continue parsing recursively). If C{goals}[0] is a terminal 57 then call C{match_word} (try to match the next word and continue parsing). 58 Both C{expand} and C{match_word} contain recursive calls to recurse_recognize. 59 60 Suppose C{goals} is empty: 61 62 Then if C{wordlist} is empty, return True. Else backtrack (If {agenda} is non-empty 63 continue parsing with the next state on the agenda; else return False) 64 backtrack also contains a recursive call to C{recurse_recognize}. 65 66 @param goals: A list of categories and/or words proposed to cover 67 wordlist, a partial derivation from the grammar. 68 @param agenda: A list of C{ParserState}s: each a pair of ([GoalList],[WordList]) 69 @param wordlist: a list of words. 70 @rtype: 71 """ 72 if len(goals) > 0: 73 next_goal = goals[0] 74 if not next_goal in self.terminals: 75 return self.expand(next_goal,goals[1:],agenda,wordlist) 76 else: 77 return self.match_word(next_goal,wordlist,goals[1:],agenda) 78 elif len(wordlist) == 0: #Success state! No goals, no words! 79 return True 80 else: 81 # Fail state 82 return self.backtrack(agenda)
83
84 - def expand(self,next_goal,rest_goals,agenda,wordlist):
85 """ 86 Expand C{next_goal} using grammar (C{self.productions}) 87 Generate new C{GoalList} using one production and add unused productions 88 to C{agenda}. Keep parsing with new C{GoalList}, new C{agenda}, old C{WordList}. 89 @param next_goal: grammar nonterminal 90 @param rest_goals: other gramar elements from the same gramar production 91 @param agenda: list of parser states. 92 @param wordlist: list of words (strings). 93 """ 94 productions = self.productions[next_goal] 95 next_production = productions[0] 96 for p in productions[1:]: 97 agenda = [ParserState(p+rest_goals,wordlist)] + agenda 98 self.trace_expand(next_goal,next_production,rest_goals, agenda,wordlist) 99 self.recursion_check() ## a bandaid for some recursion issues. 100 return self.recurse_recognize(next_production+rest_goals, agenda, wordlist)
101
102 - def match_word(self,next_goal,wordlist,rest_goals,agenda):
103 """ 104 C{next_goal} is a terminal. 105 106 Suppose C{wordlist} is non-empty: 107 108 If next_goal matches C{wordlist}[0], keep parsing with C{rest_goals}, C{agenda} 109 and C{wordlist}[1:]. 110 111 Else this parse path fails; call backtrack (if C{agenda} 112 is non-empty, continue parsing with the next parser state 113 on C{agenda}, and if C{agenda} is empty, <Return>: False) 114 115 Suppose C{wordlist} is empty. 116 117 This parse path fails. Call C{backtrack}. 118 119 @param next_goal: grammar preterminal 120 @param wordlist: list of words (strings). 121 @param rest_goals: other gramar elements from the same gramar production 122 @param agenda: list of parser states. 123 """ 124 if len(wordlist) > 0: 125 if next_goal == wordlist[0]: 126 ## We just matched a word! Discard and keep parsing rest of wordlist 127 self.trace_match(True,next_goal,wordlist[0],rest_goals,wordlist[1:]) 128 return self.recurse_recognize(rest_goals,agenda,wordlist[1:]) 129 else: 130 self.trace_match(False,next_goal,wordlist[0],rest_goals,wordlist[1:]) 131 return self.backtrack(agenda) 132 else: 133 self.trace_match(False,next_goal,'**empty**',rest_goals,[]) 134 return self.backtrack(agenda)
135 136
137 - def backtrack(self, agenda):
138 """ 139 Suppose C{agenda} is non-empty: 140 141 Then:: 142 agenda[0].GoalList = new goals 143 agenda[0].WordList = new wordlist 144 Keep parsing with new goals and new worldlist and popped agenda. 145 146 Suppose C{agenda} is empty 147 148 return False 149 """ 150 if len(agenda) > 0: 151 self.trace_backtrack(agenda[0].GoalList,agenda[0].WordList) 152 return self.recurse_recognize(agenda[0].GoalList,agenda[1:],agenda[0].WordList) 153 else: 154 return False
155 156 ### Main body of recognizer code ends here. 157
158 - def parse_string(self, string=''):
159 """ 160 Not yet implemented 161 """ 162 if self.legal_input(string): 163 print 'Not yet implemented!' 164 return False
165 166 167 ### Utility methods, tracing and type conversion below here 168
169 - def trace (self,boolean=True):
170 print boolean 171 if boolean: 172 self._trace = True 173 else: 174 self._trace = False
175
176 - def trace_match(self, boolean,next_goal,word,rest_goals,rest_wordlist):
177 if boolean and self._trace==True: 178 print 'Match succeeded: %s %s %s %s' % (next_goal, word, rest_goals,rest_wordlist) 179 elif self._trace == True: 180 print 'Match failed: %s %s %s %s' % (next_goal, word, rest_goals,rest_wordlist)
181
182 - def trace_expand(self,next_goal,next_production,goals, agenda,wordlist):
183 if self._trace==True: 184 print 'Expanding %s as %s' % (next_goal,next_production) 185 print ' Goals: %s' % (next_production+goals) 186 print ' Agenda: ' 187 for pstate in agenda: 188 print ' %s' % (pstate,)
189 190
191 - def trace_backtrack(self,goals,wordlist):
192 if self._trace == True: 193 print 'Backtracking to ' 194 print ' Goals: %s' % goals 195 print ' Wordlist: %s' % wordlist
196
197 - def string_to_wordlist (self, string):
198 return string.lower().split()
199
200 - def legal_input (self, string=''):
201 if not string: 202 self.input = self.input 203 else: 204 self.input = self.string_to_wordlist(string) 205 if self.input: 206 return True 207 else: 208 print 'Recognizer Error: No input given!' 209 return False
210
211 - def recursion_check(self):
212 if self.recursion_limit: 213 if self.recursion_depth > self.recursion_limit: 214 raise Exception('Recursion Depth exceeded!') 215 else: 216 self.recursion_depth += 1
217
218 -class Grammar:
219 """ 220 Class for Context free grammars 221 222 Grammar rules stored in self.productions 223 Start Cat stored in self.start_cat 224 Provided: a method for computing the terminals of the grammar. 225 Unchecked assumption: terminals never use upper case (built into parser) 226 """ 227
228 - def __init__(self, start_cat, trace=False):
229 """ 230 C{self.productions}: a dictionary whose keys are categories 231 and whose values are lists of productions 232 """ 233 self._trace = trace 234 self.start_cat = start_cat 235 self.productions = {} 236 self.terminals = []
237
238 - def __str__(self):
239 """ 240 If g1 is a grammar object run 'print g1' to 241 trigger this code, which builds the 242 string representation of the grammar that is printed 243 """ 244 str ='' 245 for Cat in self.productions: 246 str += Cat 247 for R in self.productions[Cat]: 248 str += '\t=> ' 249 for dtr in R: 250 str += '%s ' % dtr 251 str += '\n' 252 str += '\t ------\n' 253 return str
254 255
256 - def add_production (self, cat, production):
257 self.productions[cat]=self.productions.get(cat,[])+[production] 258 if self.terminals: 259 self.compute_terminals() 260 return production
261
262 - def compute_terminals(self):
263 """ 264 Place a set of grammar terminals in self.terminals 265 266 Assumption: No symbol that ever occurs in the LHS 267 of a production ever needs to occur in an input string. 268 269 Counterexample: If 's' is the start symbol of the grammar 270 it is also the possession-marking 'word' in English. 271 """ 272 terminals=[] 273 for cat in self.productions: 274 for p in self.productions[cat]: 275 for d in p: 276 if not d in self.productions: 277 terminals+=[d] 278 self.terminals = terminals
279
280 -class ParserState:
281 """ 282 A parser state is basically a pair of a goallist (derivation) 283 and a wordlist, with a Pythonic class instance wrapper. 284 """ 285
286 - def __init__(self,GoalList,WordList):
287 self.GoalList = GoalList 288 self.WordList = WordList
289
290 - def __str__(self):
291 """ 292 Return nice string rep. 293 294 Look like a pair! 295 Because that's what you are! 296 @rtype: string 297 """ 298 return "(%s, %s)" % (self.GoalList,self.WordList)
299
300 - def __getitem__(self, i):
301 """ 302 This is a little silly! Purely for illustration! 303 Allows ParserState instances to be accessed using 304 the usual Python sequence syntax 305 306 >>> ps = ParserState(['S'],['a', 'dog', 'walks']) 307 >>> ps[0] 308 ['S] 309 >>> ps[1] 310 ['a', 'dog', 'walks'] 311 """ 312 if i == 0: 313 return self.GoalList 314 elif i == 1: 315 return self.WordList 316 else: 317 raise Exception('Illegal index for parse state!')
318 319 ########################################################################### 320 ### Load time execution code starts here 321 ########################################################################### 322
323 -def demo (strings):
324 global g1, p1, g2, p2 325 ## The first grammar 326 g1 = Grammar('s') 327 g1.add_production('s',['np','vp']) 328 g1.add_production('s',['vp']) 329 g1.add_production('vp',['v']) 330 g1.add_production('vp',['v','np']) 331 g1.add_production('np',['pname']) 332 g1.add_production('pname',['john']) 333 g1.add_production('pname',['mary']) 334 g1.add_production('np',['d','n']) 335 g1.add_production('d',['the']) 336 g1.add_production('n',['boy']) 337 g1.add_production('n',['girl']) 338 g1.add_production('n',['beans']) 339 g1.add_production('v',['likes']) 340 g1.add_production('v',['run']) 341 g1.add_production('v',['eat']) 342 p1 = Parser(g1) 343 p1.trace() 344 ## The second grammar 345 g2 = Grammar('s') 346 g2.add_production('s',['np','vp']) 347 g2.add_production('s',['vp']) 348 g2.add_production('vp',['v']) 349 g2.add_production('vp',['v','np']) 350 g2.add_production('np',['pname']) 351 g2.add_production('pname',['john']) 352 g2.add_production('pname',['mary']) 353 g2.add_production('np',['d','n']) 354 g2.add_production('d',['the']) 355 g2.add_production('n',['boy']) 356 g2.add_production('n',['girl']) 357 g2.add_production('n',['tree']) 358 g2.add_production('n',['flowers']) 359 g2.add_production('n',['beans']) 360 g2.add_production('v',['likes']) 361 g2.add_production('v',['run']) 362 g2.add_production('v',['eat']) 363 g2.add_production('p',['with']) 364 g2.add_production('pp',['p','np']) 365 g2.add_production('np',['np','pp']) 366 p2 = Parser(g2) 367 p2.trace() 368 369 for s in strings: 370 print s 371 print '---------' 372 print p1.recognize_string(s) 373 print
374 375 376 if __name__ == '__main__': 377 strings = ['John likes Mary', 378 'The boy likes the girl', 379 'Eat beans'] 380 demo(strings) 381