Package parser_course :: Package small_parsers :: Module very_simple_cky_parser
[hide private]
[frames] | [no frames]

Source Code for Module parser_course.small_parsers.very_simple_cky_parser

  1  from small_grammar import * 
  2  from small_parser import * 
  3  import supertuple 
  4  import multivalued_dictionary 
  5   
6 -class CKYParser(SmallParser):
7 """ 8 Parse strings in self.data using CKY algorithm (see SmallParser 9 methods too): 10 11 To create CKYParser instance p1 with grammar g1: p1 = CKYParser(g1) 12 To find edge(mother_cat,start,end) (after parsing!):: 13 p1.find_parse_edge(end, start,mother_cat) 14 To pretty print all parse trees after parsing:: 15 p1.print_nltk_parses() 16 To draw all parse trees after parsing (in fancy graphix window):: 17 p1.draw_nltk_parses() 18 To draw all parses for edge(mother_cat,start,end) (after parsing!):: 19 p1.draw_nltk_parse_trees_for_edge (self,end,start,mother_cat) 20 """ 21
22 - def initialize_chart(self):
23 SmallParser.initialize_chart(self) 24 max_chart = len(self.input)+1 25 chart = [] 26 self.chart = chart 27 for j in range(max_chart): 28 this_list = [] 29 self.chart.append(this_list) 30 for i in range(max_chart): 31 this_list.append({})
32
33 - def process(self):
34 for j in range(1,len(self.chart)): 35 for cat in self.grammar.lexicon[self.input[j-1]]: 36 self.add_lexical_edge(cat,j) 37 for i in range(j-2,-1,-1): 38 for k in range(i+1, j): 39 for cat in self.chart[k][i]: 40 for rule in self.grammar.find_BURules(cat): 41 cat2 = rule.next_dtr 42 if cat2 in self.chart[j][k]: 43 dtrs = (ChartEdge(k,i,cat),\ 44 ChartEdge(j,k,cat2)) 45 self.add_to_chart(j,i,rule.mother_cat,\ 46 dtrs)
47 48 49 50 ## End main body of parsing algorithm 51 52 ## Interface to chart and dtr records 53
54 - def add_to_chart(self,end,start,mother_cat,dtrs):
55 self.chart[end][start][mother_cat]=True 56 self.update_dtr_records(end,start,mother_cat, dtrs) 57 self._trace_print_binary_rule_(mother_cat,dtrs)
58
59 - def add_lexical_edge(self,cat,j):
60 self._trace_print_start_(j-1,1) 61 self.chart[j][j-1][cat]=True 62 self._trace_print_unary_rule_(cat,ChartEdge(j,j-1,\ 63 self.input[j-1]))
64
65 - def update_dtr_records(self,len, start, cat, dtrs):
66 self.dtr_dict[ChartEdge(len,start,cat)].append(dtrs)
67 68 ## End Interface to chart and dtr records 69 70 ## Edge retrieval and verification methods 71
72 - def find_parse_edge(self,end,start,mother_cat):
73 """Retrieve unique edge matching description from chart. 74 Complete edge descriptions only.""" 75 edge_info=ChartEdge(end,start,mother_cat) 76 return self.matching_edge(edge_info)
77
78 - def find_spanning_edge(self):
79 edge_info = ChartEdge(len(self.input),0,self.grammar.start_cat) 80 return self.matching_edge(edge_info)
81
82 - def matching_edge (self,edge_info):
83 """ 84 Return True if edge matching edge_info triple exists 85 in chart. Else return Fale. 86 """ 87 try: 88 self.chart[edge_info.end][edge_info.start][edge_info.mother_cat] 89 return edge_info 90 except KeyError: 91 return False
92 93 ## End edge retrieval and verification 94
95 - def display_chart(self):
96 for end in xrange(1,len(self.chart)+1): 97 print 'end[%s]' % end 98 print 99 for start in range(len(self.chart[end])): 100 print 'start %s (%s): %s' % (start,self.word_span(end,start),\ 101 self.chart[end][start]) 102 print
103 104 ## Chart like data structure with parse tree building info 105 ## In dynamic programming parlance, the backtrace table. 106
107 - def display_dtr_dict(self):
108 for m in self.dtr_dict.keys(): 109 print 110 print '%s' % m 111 m.display() 112 print ' Num parses: %s' % len(self.dtr_dict[m]) 113 parse_ctr = 0 114 for dl in self.dtr_dict[m]: 115 parse_ctr += 1 116 print 117 print ' Parse No %s:' % parse_ctr 118 for d in dl: 119 print ' %s' % d 120 d.display() 121 print
122
123 - def draw_nltk_parse_trees_for_edge (self,end,start,mother_cat):
124 edge = self.find_parse_edge(end,start,mother_cat) 125 if edge: 126 for pt in self.get_nltk_parse_trees(edge): 127 pt.draw() 128 else: 129 print 'No such edge found!'
130
131 - def _trace_print_binary_rule_(self, mother_cat,dtr_pair):
132 if self._trace: 133 self.edge_count += 1 134 print ' %s -> %s %s' % (mother_cat, dtr_pair[0].mother_cat,\ 135 dtr_pair[1].mother_cat)
136
137 - def _trace_print_unary_rule_(self, mother_cat,dtr):
138 if self._trace: 139 self.edge_count += 1 140 print' %s -> %s' % (mother_cat,dtr.mother_cat)
141
142 - def _trace_print_end_ (self,end):
143 if self._trace: 144 print 145 print 'end[%s]:' % (end,)
146
147 - def _trace_print_start_ (self,start,end):
148 if self._trace: 149 print ' start: %s (%s)' % (start, self.word_span(end,start))
150
151 - def word_span(self,end,start):
152 return ' '.join(self.input[start:start+end])
153
def chart_edge_template():
    """
    This template function merely returns a triple of 'tuple_attributes'::

        >>> chart_edge_template()
        ('end', 'start', 'mother_cat')

    Executing the following then creates a class called ChartEdge
    with those 3 attributes::

        ChartEdge = supertuple.superTuplePlus('ChartEdge', *chart_edge_template())

    (The docstring previously referred to a nonexistent
    ``ChartEdgeTemplate`` and omitted the ``*``-unpacking used at
    module level; both are corrected here.)

    ChartEdges are tuples of end, start, and mother_cat,
    with gettable attributes with those names
    provided for each instance, e.g.::

        e1 = ChartEdge(end=1, start=0, mother_cat='s')
        e1.mother_cat       # 's'

    This new class is a subclass of tuple and its instances
    really are tuple instances supporting all the usual tuple
    methods (``e1[2] == 's'``); they additionally support a keyword
    display method (``e1.display()``).

    To test if e2 and e1 are equivalent: ``e1 == e2`` (same as tuples).
    To test if an edge equivalent to e1 occurs in list L: ``e1 in L``.
    To find the index of an edge equivalent to e1 in list L:
    ``L.index(e1)`` — raises ValueError if L contains no such edge.
    """
    return ('end','start','mother_cat')
## ChartEdge: concrete edge tuple with fields (end, start, mother_cat).
ChartEdge = supertuple.superTuplePlus('ChartEdge',*chart_edge_template())

## BURuleBase: base pair class for bottom-up-indexed CNF rules,
## fields (next_dtr, mother_cat).
BURuleBase = supertuple.superTuple('BURule','next_dtr','mother_cat')
class BURule(BURuleBase):
    """BURules are pairs of next daughters and mother cats.

    We index our CNF rules by their first daughters, so the first
    daughter is not stored on the rule itself.

    >>> rule = BURule('v','vp')
    >>> print rule.next_dtr, rule.mother_cat
    v vp
    """
208 209 #################################################################### 210 ### 211 ### C N F G r a m m a r 212 ### 213 #################################################################### 214
class cnf_grammar (Grammar):

    """
    Wraps an arbitrary context-free grammar: converts it to Chomsky
    normal form and precomputes a bottom-up rule index suitable for
    efficient use with a CKY parser.
    """

    def __init__(self, base_grammar):
        """
        base_grammar is a (possibly non-CNF) grammar instance used as
        the basis for this CNF grammar.
        """
        cnf = make_cnf_grammar(base_grammar)
        self.rules = cnf.rules
        self.lexicon = cnf.lexicon
        self.start_cat = cnf.start_cat
        self.compute_nonterminals()
        self.compute_preterminals()
        self.compute_bottum_up_grammar()

    def compute_bottum_up_grammar(self):
        """Index every CNF rule by its first daughter category."""
        # Dictionary whose default value for an unknown key is [].
        self.bottum_up_grammar = \
            multivalued_dictionary.multivaluedDictionary()
        for mother, expansions in self.rules.items():
            for dtr_pair in expansions:
                # Index each two-daughter rule by its first daughter.
                bu_rule = BURule(dtr_pair[1], mother)
                self.bottum_up_grammar[dtr_pair[0]].append(bu_rule)

    def find_BURules(self, cat):
        """Return the bottom-up rules whose first daughter is cat,
        or [] when there are none."""
        try:
            return self.bottum_up_grammar[cat]
        except KeyError:
            return []
254 255
class Ctr (object):
    """A minimal mutable integer counter."""

    def __init__(self, init_val=0):
        # Current counter value.
        self.val = init_val

    def get_val(self):
        """Return the current counter value."""
        return self.val

    def inc_val(self, increment=1):
        """Add increment (default 1) to the counter; return the new value."""
        self.val = self.val + increment
        return self.val

    def set_val(self, init_val=0):
        """Reset the counter (to 0 when called with no argument)."""
        self.val = init_val
def make_cnf_grammar(base_grammar, prefix='X'):
    """
    Return a new instance of class C{Grammar}
    containing a grammar equivalent to C{base_grammar}'s grammar but
    in Chomsky normal form.

    The base_grammar class inherited from must
    define a grammar.

    prefix is the leading string used for freshly invented category
    names (prefix + counter, via make_new_cat).
    """
    ## Bookkeeping structures for computing new grammar:
    ## chains records unary rules cat -> [daughter, ...] (deferred to
    ## the last pass); new_cats maps an expansion tuple to the fresh
    ## category invented to rewrite as it.
    chains = multivalued_dictionary.multivaluedDictionary()
    new_cats = dict()
    ## Make a new instance of the same class as base_grammar
    new_grammar = base_grammar.__class__()
    ## The grammar may have instance-specific attributes,
    ## so copy from the instance.
    ## NOTE(review): the lexicon is aliased, not copied — the
    ## extend_preterminals calls triggered below mutate
    ## base_grammar.lexicon as well; confirm this is intended.
    new_grammar.lexicon = base_grammar.lexicon
    new_grammar.new_cat_prefix = prefix
    new_grammar.start_cat = base_grammar.start_cat
    ## Except for the rules
    new_grammar.rules = {}
    ## Add a counter to generate unique names for new cats.
    new_grammar.new_cat_ctr = Ctr()
    ## First pass at computing new grammar's rules.
    for cat in base_grammar.rules:
        expansions = []
        new_grammar.rules[cat] = expansions
        for expansion in base_grammar.rules[cat]:
            if len(expansion) == 1:
                # Unary rule: defer to the chain-collapsing pass below.
                chains[cat].append(expansion[0])
            else:
                # Binarize rules with two or more daughters.
                process_noncnf_rule(tuple(expansion),expansions, \
                                    new_cats,new_grammar)
    ## Add the rules for new cats.
    for exp in new_cats:
        update_list_dict(new_grammar.rules,new_cats[exp],list(exp))
        # new_grammar.rules[new_cats[exp]].append(exp)
    ## The next step needs a deep copy of the existing rules
    ## (so chain collapsing does not see rules it itself added).
    new_rules = {}
    for k in new_grammar.rules:
        new_rules[k] = new_grammar.rules[k][:]
    new_grammar.compute_nonterminals()
    ## Finally take care of chains of unary branching rules.
    for (cat,links) in chains.items():
        for link in links:
            process_chain(cat,link,chains,base_grammar,\
                          new_grammar, new_rules,[cat])
    return new_grammar
319
def process_chain (root_cat, current_cat,chains,\
                   base_grammar, new_grammar,new_rules, seen):
    # Collapse the unary chain root_cat ->* current_cat: copy
    # current_cat's lexical categories and binarized rules up to
    # root_cat, then follow any further unary links from current_cat.
    # Raises on a unary-branching cycle.
    if current_cat in seen:
        raise Exception, 'Unary branching cycle for category %s' % current_cat
    if current_cat in base_grammar.preterminals:
        extend_preterminals(new_grammar,current_cat,root_cat)
    if current_cat in new_rules:
        new_grammar.rules[root_cat].extend(new_rules[current_cat])
    try:
        ## OK to do destrutcive mod here?
        ## NOTE(review): `seen` is mutated in place, so sibling branches
        ## of the chain share one visited set — confirm that is intended
        ## (it makes the cycle check stricter than per-path checking).
        seen.append(current_cat)
        for next_cat in chains[current_cat]:
            process_chain(root_cat,next_cat, chains,base_grammar,\
                          new_grammar,new_rules, seen)
    except KeyError:
        # current_cat has no further unary links: chain ends here.
        pass
336
def process_noncnf_rule(expansion,expansions, new_cats, grammar):
    """
    Binarize one rule expansion, appending the resulting two-category
    form to expansions.

    C{new_cats} is a dictionary whose keys are tuples
    and whose values are new categories expanding as those tuples,
    used to prevent hypothesizing two distinct new categories
    expanding as the same sequence.
    """
    if len(expansion) == 2:
        # Already binary: record it directly.
        expansions.append(list(expansion))
    else:
        # Peel off the first two cats and fold them into one category.
        prefix_pair = expansion[0:2]
        if prefix_pair in new_cats:
            # This two-cat sequence was seen before: reuse its category.
            fresh_cat = new_cats[prefix_pair]
        else:
            # New sequence: invent a category for it.
            fresh_cat = make_new_cat(grammar)
            new_cats[prefix_pair] = fresh_cat
        process_noncnf_rule((fresh_cat,) + expansion[2:],
                            expansions, new_cats, grammar)
358
def extend_preterminals (grammar,current_cat,root_cat):
    """Give root_cat to every lexicon word that has current_cat.

    Used when collapsing a unary chain root_cat ->* current_cat: each
    word listed under current_cat also gets root_cat appended to its
    category list (the lexicon is mutated in place).
    """
    lexicon = grammar.lexicon
    for word in list(lexicon.keys()):
        cats = lexicon[word]
        if current_cat in cats:
            cats.append(root_cat)
365
def make_new_cat (grammar):
    """Invent a fresh category name: the grammar's new-cat prefix
    followed by the next value of its counter."""
    next_id = grammar.new_cat_ctr.inc_val()
    return '%s%s' % (grammar.new_cat_prefix, next_id)
368 369
def update_list_dict(list_dict, key, element):
    """Append element to list_dict[key], creating the list if key is new.

    Equivalent to collections.defaultdict(list) behavior for a plain
    dict.  (Idiom fix: setdefault both inserts the default and returns
    the list, so the old second lookup was redundant.)
    """
    list_dict.setdefault(key, []).append(element)
if __name__ == '__main__':
    ## Demo driver: build CNF grammars from the toy grammars defined in
    ## small_grammar (imported via *) and parse sample sentences.
    g0 = toy_grammar()
    g1 = cnf_grammar(g0)
    p1 = CKYParser(g1)
    i1 = "the dogs in the park walk"
    i2 = "the agency sees widespread use of the codes as a way of handling the rapidly growing mail volume and controlling labor costs"
    i3 = "the agency sees use of the codes as a way"
    g2 = not_so_toy_grammar()
    g3 = cnf_grammar(g2)
    p2 = CKYParser(g3)
    ## Second argument presumably enables tracing — verify against
    ## SmallParser.parse_input.
    p2.parse_input(i3,True)
    # p2.parse_input(i2,True)

    ## To parse
    i4 = "the dogs in the park in the dogs walk"
    # p1.parse_input(i4,True)
    p1.parse_input(i4)
    ## Edge description covering the whole input with the start category.
    s_edge = ChartEdge(len(p1.input),0,p1.grammar.start_cat)