1 from small_grammar import *
2 from small_parser import *
3 import supertuple
4 import multivalued_dictionary
5
7 """
8 Parse strings in self.data using CKY algorithm (see SmallParser
9 methods too):
10
11 To create CKYParser instance p1 with grammar g1: p1 = CKYParser(g1)
12 To find edge(mother_cat,start,end) (after parsing!)::
13 p1.find_parse_edge(end, start,mother_cat)
14 To pretty print all parse trees after parsing::
15 p1.print_nltk_parses()
16 To draw all parse trees after parsing (in fancy graphix window)::
17 p1.draw_nltk_parses()
18 To draw all parses for edge(mother_cat,start,end) (after parsing!)::
19 p1.draw_nltk_parse_trees_for_edge (self,end,start,mother_cat)
20 """
21
23 SmallParser.initialize_chart(self)
24 max_chart = len(self.input)+1
25 chart = []
26 self.chart = chart
27 for j in range(max_chart):
28 this_list = []
29 self.chart.append(this_list)
30 for i in range(max_chart):
31 this_list.append({})
32
34 for j in range(1,len(self.chart)):
35 for cat in self.grammar.lexicon[self.input[j-1]]:
36 self.add_lexical_edge(cat,j)
37 for i in range(j-2,-1,-1):
38 for k in range(i+1, j):
39 for cat in self.chart[k][i]:
40 for rule in self.grammar.find_BURules(cat):
41 cat2 = rule.next_dtr
42 if cat2 in self.chart[j][k]:
43 dtrs = (ChartEdge(k,i,cat),\
44 ChartEdge(j,k,cat2))
45 self.add_to_chart(j,i,rule.mother_cat,\
46 dtrs)
47
48
49
50
51
52
53
58
64
66 self.dtr_dict[ChartEdge(len,start,cat)].append(dtrs)
67
68
69
70
71
73 """Retrieve unique edge matching description from chart.
74 Complete edge descriptions only."""
75 edge_info=ChartEdge(end,start,mother_cat)
76 return self.matching_edge(edge_info)
77
79 edge_info = ChartEdge(len(self.input),0,self.grammar.start_cat)
80 return self.matching_edge(edge_info)
81
83 """
84 Return True if edge matching edge_info triple exists
85 in chart. Else return Fale.
86 """
87 try:
88 self.chart[edge_info.end][edge_info.start][edge_info.mother_cat]
89 return edge_info
90 except KeyError:
91 return False
92
93
94
96 for end in xrange(1,len(self.chart)+1):
97 print 'end[%s]' % end
98 print
99 for start in range(len(self.chart[end])):
100 print 'start %s (%s): %s' % (start,self.word_span(end,start),\
101 self.chart[end][start])
102 print
103
104
105
106
108 for m in self.dtr_dict.keys():
109 print
110 print '%s' % m
111 m.display()
112 print ' Num parses: %s' % len(self.dtr_dict[m])
113 parse_ctr = 0
114 for dl in self.dtr_dict[m]:
115 parse_ctr += 1
116 print
117 print ' Parse No %s:' % parse_ctr
118 for d in dl:
119 print ' %s' % d
120 d.display()
121 print
122
130
132 if self._trace:
133 self.edge_count += 1
134 print ' %s -> %s %s' % (mother_cat, dtr_pair[0].mother_cat,\
135 dtr_pair[1].mother_cat)
136
138 if self._trace:
139 self.edge_count += 1
140 print' %s -> %s' % (mother_cat,dtr.mother_cat)
141
143 if self._trace:
144 print
145 print 'end[%s]:' % (end,)
146
148 if self._trace:
149 print ' start: %s (%s)' % (start, self.word_span(end,start))
150
152 return ' '.join(self.input[start:start+end])
153
155 """
156 This template function merely returns a triple of 'tuple_attributes'::
157 >>> ChartEdgeTemplate()
158 ('end','start','mother_cat')
159 Executing the following then creates a class called ChartEdge
160 with those 3 attributes::
161
162 >>> ChartEdge = supertuple.superTuplePlus('ChartEdge',ChartEdgeTemplate())
163 The ChartEdge class that ChartEdgeTemplate spawns is documented here.
164
165 ChartEdges are tuples of end, start, and mother_cat,
166 with gettable attributes with those names
167 provided for each instance.
168
169 Now we create an instance of the new class::
170 >>> e1 = ChartEdge(end= 1,start=0,mother_cat='s')
171 >>> e1
172 ChartEdge(1, 0, 's')
173 >>> e1.mother_cat
174 's'
175 This new class is a subclass of tuple and its instances
176 really are tuple instances supporting all the usual tuple methods::
177 >>> e1[2]
178 's'
179 They just have some extras, like the keyword access illustrated
180 above and a keyword display method::
181 >>> e1.display()
182 end: 1
183 start: 0
184 mother_cat: s
185 To test if e2 and e1 are equivalent: e1 == e2 (same as tuples)
186 To test if an edge equivalent to e1 occurs in list L::
187 e1 in L (same as tuples)
188 To find the index of an edge equivalent to e1 in list L::
189 L.index(e1) (same as tuples)
190 Note: Raises ValueError if L contains no such edge.
191 """
192 return ('end','start','mother_cat')
193
194
195 ChartEdge = supertuple.superTuplePlus('ChartEdge',*chart_edge_template())
196
197 BURuleBase = supertuple.superTuple('BURule','next_dtr','mother_cat')
198
200 """BURules are pairs of next daughters and mother cats.
201 We index our CNF rules by first dtrs, so we dont store them
202 >>> rule = BURule('v','vp')
203 >>> rule
204 BURule('v','vp')
205 >>> print rule.next_dtr, rule.mother_cat
206 v vp
207 """
208
209
210
211
212
213
214
216
217 """
218 Initialized with an arbitrary context-free grammar,
219 this turns the grammar into a cnf grammar and computes
220 a bottum up grammar suitable for efficient
221 use with a CKY parser.
222 """
223
237
239
240 self.bottum_up_grammar = \
241 multivalued_dictionary.multivaluedDictionary()
242 for pair in self.rules.items():
243 mother = pair[0]
244 for rule in pair[1]:
245
246 bu_rule = BURule(rule[1],mother)
247 self.bottum_up_grammar[rule[0]].append(bu_rule)
248
250 try:
251 return self.bottum_up_grammar[cat]
252 except KeyError:
253 return []
254
255
257
260
263
265 self.val += increment
266 return self.val
267
270
272 """
273 Return a new instance of class C{Grammar}
274 containing a grammar equivalent to C{base_grammar}'s grammar but
275 in Chomsky normal form.
276
277 The base_grammar class inherited from must
278 define a grammar.
279 """
280
281 chains = multivalued_dictionary.multivaluedDictionary()
282 new_cats = dict()
283
284 new_grammar = base_grammar.__class__()
285
286
287 new_grammar.lexicon = base_grammar.lexicon
288 new_grammar.new_cat_prefix = prefix
289 new_grammar.start_cat = base_grammar.start_cat
290
291 new_grammar.rules = {}
292
293 new_grammar.new_cat_ctr = Ctr()
294
295 for cat in base_grammar.rules:
296 expansions = []
297 new_grammar.rules[cat] = expansions
298 for expansion in base_grammar.rules[cat]:
299 if len(expansion) == 1:
300 chains[cat].append(expansion[0])
301 else:
302 process_noncnf_rule(tuple(expansion),expansions, \
303 new_cats,new_grammar)
304
305 for exp in new_cats:
306 update_list_dict(new_grammar.rules,new_cats[exp],list(exp))
307
308
309 new_rules = {}
310 for k in new_grammar.rules:
311 new_rules[k] = new_grammar.rules[k][:]
312 new_grammar.compute_nonterminals()
313
314 for (cat,links) in chains.items():
315 for link in links:
316 process_chain(cat,link,chains,base_grammar,\
317 new_grammar, new_rules,[cat])
318 return new_grammar
319
320 -def process_chain (root_cat, current_cat,chains,\
321 base_grammar, new_grammar,new_rules, seen):
322 if current_cat in seen:
323 raise Exception, 'Unary branching cycle for category %s' % current_cat
324 if current_cat in base_grammar.preterminals:
325 extend_preterminals(new_grammar,current_cat,root_cat)
326 if current_cat in new_rules:
327 new_grammar.rules[root_cat].extend(new_rules[current_cat])
328 try:
329
330 seen.append(current_cat)
331 for next_cat in chains[current_cat]:
332 process_chain(root_cat,next_cat, chains,base_grammar,\
333 new_grammar,new_rules, seen)
334 except KeyError:
335 pass
336
338 """
339 C{new_cats} is a dictionary whose keys are tuples
340 and whose values are new categories expanding as those tuples,
341 used to prevent hypothesizing two distinct new categories
342 expanding as the same sequence.
343 """
344 if len(expansion) == 2:
345 expansions.append(list(expansion))
346 else:
347
348 new_cat_expansion = expansion[0:2]
349 try:
350
351 new_cat = new_cats[new_cat_expansion]
352 except KeyError:
353
354 new_cat = make_new_cat(grammar)
355 new_cats[new_cat_expansion] = new_cat
356 process_noncnf_rule((new_cat,) + expansion[2:],\
357 expansions, new_cats, grammar)
358
360 words = grammar.lexicon.keys()
361 lexicon = grammar.lexicon
362 for w in words:
363 if current_cat in lexicon[w]:
364 lexicon[w].append(root_cat)
365
367 return grammar.new_cat_prefix + str(grammar.new_cat_ctr.inc_val())
368
369
371 list_dict.setdefault(key,[])
372 list_dict[key].append(element)
373
374
375 if __name__ == '__main__':
376 g0 = toy_grammar()
377 g1 = cnf_grammar(g0)
378 p1 = CKYParser(g1)
379 i1 = "the dogs in the park walk"
380 i2 = "the agency sees widespread use of the codes as a way of handling the rapidly growing mail volume and controlling labor costs"
381 i3 = "the agency sees use of the codes as a way"
382 g2 = not_so_toy_grammar()
383 g3 = cnf_grammar(g2)
384 p2 = CKYParser(g3)
385 p2.parse_input(i3,True)
386
387
388
389 i4 = "the dogs in the park in the dogs walk"
390
391 p1.parse_input(i4)
392 s_edge = ChartEdge(len(p1.input),0,p1.grammar.start_cat)
393