Package parser_course :: Package small_parsers :: Module small_parser
[hide private]
[frames] | no frames]

Source Code for Module parser_course.small_parsers.small_parser

  1  import multivalued_dictionary 
  2  try: 
  3      from nltk import tree 
  4  except: 
  5      print '*Warning*: nltk not found' 
  6      print 'Parse tree drawing and printing will not work.' 
  7   
  8   
  9  ################################################################# 
 10  ### 
 11  ### 
 12  ###       L a T e x    S t r i n g s 
 13  ### 
 14  ### 
 15  ################################################################# 
 16       
 17  hdr = r""" 
 18  \documentclass{article} 
 19  \usepackage{qtree} 
 20  \usepackage{array} 
 21  \usepackage{graphics} 
 22  \usepackage{lscape} 
 23  \setlength{\textwidth}{8in} 
 24   
 25  \title{Parse Trees} 
 26  \newcommand{\pythonprompt}{${\scriptstyle \rangle\!\rangle\!\rangle}$ } 
 27   
 28  \begin{document} 
 29   
 30  \maketitle 
 31  \vspace{1.5in} 
 32  \fbox{ 
 33  \rule{.1in}{0mm} 
 34  \parbox{4.5in}{ 
 35  \rule{0mm}{.2in} 
 36  This document was constructed using the tex\_output\_parses method of 
 37  the python small\_parser module (one of the modules in package {\em 
 38  small\_parsers}, written by Mark Gawron). The small\_parser module 
 39  in the small\_parsers package defines the bare-bones interface used by 
 40  the 'small' parsers in that package.  Simply parse a sentence with 
 41  any parser {\em p1} that inherits from the class 
 42  small\_parsers.small\_parser.SmallParser and call:\\[.1in] 
 43  \pythonprompt p1.tex\_output\_parses('parses.tex')\\[.1in] 
 44  and the \LaTeX{} file {\em parses.tex} will be created with tree drawing 
 45  instructions for all parses of the last sentence parsed. 
 46  The \LaTeX{} qtree package is assumed to be installed. 
 47  If the parse trees are not fitting on a page, use the optional second 
 48  argument of the method, which will will shrink the trees using the 
 49  graphics package scalebox command:\\[.1in] 
 50  \pythonprompt p1.tex\_output\_parses('parses.tex',0.6)\\[.1in] 
 51  prints trees shrunk to 0.6 their original dimensions. Use Unix command\\[.1in] 
 52  \% make\_tex parses\\[.1in] 
 53  to make a pdf file.\\ 
 54  \rule{0mm}{.2in} 
 55  }} 
 56  \newpage 
 57  \begin{landscape} 
 58   
 59         """ 
 60  pre_filler = r""" 
 61  \vspace{.1in} 
 62  \begin{tabular}[t]{@{}l} 
 63  Parse %d\\ 
 64  \hskip -3.0in 
 65  \scalebox{%.2f}{ 
 66  """ 
 67  post_filler =r""" 
 68  } 
 69  \end{tabular} 
 70  %\newpage 
 71   
 72  """ 
 73   
 74  end = r""" 
 75  \end{landscape} 
 76  \end{document} 
 77  """ 
 78   
 79  ################################################################# 
 80  ### 
 81  ### 
 82  ###      E n d    L a T e x    S t r i n g s 
 83  ### 
 84  ### 
 85  ################################################################# 
 86   
 87   
88 -class SmallParser:
89 90 """ 91 Class providing general methods for a modest class of chart 92 parsers. The idea is that a C{SmallParser} instance p1 is 93 responsible for constructing chart edges sufficient for building 94 all parse trees for the current input p1.input. This set of chart 95 edges is called a parse forest because it is an efficient densely 96 packed representation of all the parses. 97 98 99 A parse forest should be stored in the parser's 100 dtr_dict as a dictionary in the form:: 101 102 p1.dtr_dict[edge] = List of DtrLists 103 104 where each DtrList is the list of dtr edges in one 105 local subtree, For example, suppose 'see a man with a 106 telescope' has 2 parses with two local subtrees:: 107 108 vp:e1 | vp:e1 109 / \ | / \ 110 vp:e2 pp:e3 | v:e4 np:e5 111 / \ / \ | / / \ 112 / \ / \ | / / \ 113 see a man with a tel.| see a man with a tel. 114 115 Parse nodes have been annotated with the names 116 of the edge representing them. Then for a parser p1 117 that has just parsed this ambiguous sentence, 118 p1.dtr_dict[e1] is a list of 2 dtr lists:: 119 120 p1.dtr_dict[e1] = [[e2,e3],[e4,e5]] 121 122 This code provides initialization structure and parse tree 123 handling functionality for a parser meeting the above specs. 124 125 To parse string s: p1.parse_input(s). Returns a list of parse trees. 126 To parse string s with tracing on: p1.parse_input(s,True) 127 To inspect chart after parsing: p1.display_chart() 128 To inspect dtr dictionary after parsing: p1.display_dtr_dict() 129 To find the set of parse trees associated with an edge e1:: 130 131 self.get_nltk_parse_trees(e1) 132 133 To find s-edge spanning input: p1.find_spanning_edge() 134 To find tokenized data: self.input [a list] 135 To find lowercased data string: self.data [a string] 136 To see list of parse trees (after parsing): p1.parses 137 To see list of pretty printed parse trees: self.print_nltk_parses() 138 This returns set of pprinted strings as well. 139 140 To draw parse trees in a succession of Tkinter canvass windows:: 141 142 self.draw_nltk_parses() 143 144 To see if a parse exists (after parsing):: 145 146 self.parse_exists() 147 148 This returns True or False. 149 150 To find a parse edge of a given description:: 151 152 p1.find_parse_edge(...) 153 154 Arguments vary with different parsers and different notions of a 155 omplete edge description. 156 157 To draw all parses for an edge of a given description (after parsing!):: 158 159 p1.draw_nltk_parse_trees_for_edge (...) 160 """
161 - def __init__(self, grammar={},data='',trace=False):
162 self.tokenize(data) 163 self.grammar=grammar 164 self._trace = False
165
166 - def initialize_chart(self):
167 self.edge_count=0 168 ## Dictionary whose default value for unknown key is [] 169 self.dtr_dict = multivalued_dictionary.multivaluedDictionary() 170 self.parses = []
171
172 - def tokenize(self,datastring):
173 """ 174 Pretty naive tokenizer. 175 176 Lower case everything. Split on spaces. 177 """ 178 self.data = datastring.lower() 179 self.input = datastring.split()
180
181 - def reset_input (self,data):
182 if data: 183 self.tokenize(data) 184 self.initialize_chart() 185 # result of tokenization stored here 186 return self.input
187
188 - def parse_input (self,data='',trace=False):
189 self._trace=trace 190 self.reset_input(data) 191 self.process() 192 return self.find_parses()
193
194 - def process(self):
195 """Abstract method. Implements a particular 196 a particular chart parsing algorithm.""" 197 raise Exception, """process must be implemented 198 for each parse"""
199
200 - def find_spanning_edge(self):
201 """Abstract method: 202 Shd be defined to return a spanning edge 203 from which dtr edges can be recursively retrieved 204 from self.dtr_dict""" 205 raise Exception, """find_spanning_edge must be implemented 206 for each parser"""
207 208 ### Begin parse retrieval methods 209
210 - def parse_exists(self):
211 edge_info=self.find_spanning_edge() 212 if edge_info: return True 213 else: return False
214
215 - def find_nltk_parses(self):
216 parse_edge = self.find_spanning_edge() 217 if parse_edge: 218 print 'Parse found!' 219 self.parses = self.get_nltk_parse_trees(parse_edge) 220 else: 221 print 'No parses found!' 222 return False
223
224 - def get_nltk_parse_trees(self, edge):
225 result = [] 226 for dtr_list in self.dtr_dict[edge]: 227 for dtr_tree_list in\ 228 cross_multiply([self.get_nltk_parse_trees(dtr)\ 229 for dtr in dtr_list]): 230 result.append(tree.Tree(edge.mother_cat,dtr_tree_list)) 231 if result: 232 return result 233 else: 234 # No Daughters! Leaf node! 235 return [tree.Tree(edge.mother_cat,[self.input[edge.start]])]
236
237 - def find_list_parses(self):
238 parse_edge = self.find_spanning_edge() 239 if parse_edge: 240 print 'Parse found!' 241 self.parses = self.get_list_parse_trees(parse_edge) 242 else: 243 print 'No parses found!' 244 return False
245
246 - def get_list_parse_trees(self, edge):
247 """Analogue of get_nltk_parse_trees that 248 doesnt return nltk trees, just ordinary 249 list-representations of trees.""" 250 result = [] 251 for dtr_list in self.dtr_dict[edge]: 252 result += cross_multiply([[edge.mother_cat]]+\ 253 [self.get_list_parse_trees(dtr) for dtr in dtr_list]) 254 if result: 255 return result 256 else: 257 # No Daughters! Leaf node! 258 return [[edge.mother_cat,self.input[edge.start]]]
259 260 try: 261 tree 262 find_parses = find_nltk_parses 263 except NameError: 264 find_parses = find_list_parses 265 266 ### End parse retrieval methods 267
268 - def display_chart(self):
269 """Abstract method: 270 Shd be defined with parser-specific print methods 271 to print each edge in the chart. 272 """ 273 raise Exception, """display_chart must be implemented 274 for each parser"""
275
276 - def draw_nltk_parses(self):
277 for p in self.parses: 278 p.draw()
279
280 - def print_nltk_parses(self):
281 # pstrings = [p.pp() for p in self.parses] 282 for p in self.parses: 283 print p 284 print
285
286 - def draw_list_parses(self):
287 """Analogue of draw_nltk_parses for list representations of tree. 288 Note: nltk still needed.""" 289 for p in self.parses: 290 list_to_tree(p).draw()
291
292 - def tex_output_parses (self, filename, scale=1.0):
293 """ 294 This method is called as follows:: 295 296 >>> p1.tex_output_parses('parses.tex') 297 298 This creates a LaTex file which contains latex tree drawing 299 instructions for all parses of the last sentence parsed. The 300 LaTex qtree package is assumed to be installed. 301 C{self.parses} contains a list of nltk trees for which the 302 nltk method C{pprint_latdex_qtree} is called. 303 304 If the parse trees are not fitting on a page, use the optional 305 second argument of the method, which will will shrink the 306 trees using the graphics package scalebox command:: 307 308 >>> p1.tex_output_parses('parses.tex', 0.6) 309 310 prints trees shrunk to .6 their original dimensions. 311 """ 312 fh = open(filename,'w') 313 plist = self.parses 314 num_parses = len(plist) 315 print >> fh, hdr 316 print >> fh, 'There were %d parses.' % (num_parses,) 317 for (ind,t) in enumerate(self.parses): 318 ct = ind + 1 319 print >> fh 320 print >> fh, pre_filler % (ct,scale), 321 print >> fh, t.pprint_latex_qtree() 322 print >> fh, post_filler 323 # dont do these things on last parse 324 # if ct <> num_parses : 325 # pass 326 print >> fh, end 327 fh.close()
328 329 330
331 -def cross_multiply(ListList):
332 """ 333 Returns cross-product of the lists in ListList: 334 >>> cross_multiply([['a','b','c'],[1,2],['x','y','z']]) 335 [['a', 1, 'x'], ['b', 1, 'x'], ['c', 1, 'x'], 336 ['a', 2, 'x'], ['b', 2, 'x'], ['c', 2, 'x'], 337 ['a', 1, 'y'], ['b', 1, 'y'], ['c', 1, 'y'], 338 ['a', 2, 'y'], ['b', 2, 'y'], ['c', 2, 'y'], 339 ['a', 1, 'z'], ['b', 1, 'z'], ['c', 1, 'z'], 340 ['a', 2, 'z'], ['b', 2, 'z'], ['c', 2, 'z']] 341 """ 342 result = [[]] 343 for n_list in ListList: 344 result = [result_list+[n_filler] 345 for n_filler in n_list 346 for result_list in result] 347 return result
348 349 ## Used for non-nltk trees. maps lists to nltk trees, used to draw them
350 -def list_to_tree(input):
351 if isinstance(input,list): 352 if len(input) > 0: 353 dtrs = [list_to_tree(elem) for elem in input[1:]] 354 return tree.Tree(input[0],dtrs) 355 else: 356 raise Exception, 'Empty list cannot be tree!' 357 else: 358 return tree.Tree(input,[])
359