parser_course.small_parsers.small

89 90 """ 91 Class providing general methods for a modest class of chart 92 parsers. The idea is that a C{SmallParser} instance p1 is 93 responsible for constructing chart edges sufficient for building 94 all parse trees for the current input p1.input. This set of chart 95 edges is called a parse forest because it is an efficient densely 96 packed representation of all the parses. 97 98 99 A parse forest should be stored in the parser's 100 dtr_dict as a dictionary in the form:: 101 102 p1.dtr_dict[edge] = List of DtrLists 103 104 where each DtrList is the list of dtr edges in one 105 local subtree, For example, suppose 'see a man with a 106 telescope' has 2 parses with two local subtrees:: 107 108 vp:e1 | vp:e1 109 / \ | / \ 110 vp:e2 pp:e3 | v:e4 np:e5 111 / \ / \ | / / \ 112 / \ / \ | / / \ 113 see a man with a tel.| see a man with a tel. 114 115 Parse nodes have been annotated with the names 116 of the edge representing them. Then for a parser p1 117 that has just parsed this ambiguous sentence, 118 p1.dtr_dict[e1] is a list of 2 dtr lists:: 119 120 p1.dtr_dict[e1] = [[e2,e3],[e4,e5]] 121 122 This code provides initialization structure and parse tree 123 handling functionality for a parser meeting the above specs. 124 125 To parse string s: p1.parse_input(s). Returns a list of parse trees. 126 To parse string s with tracing on: p1.parse_input(s,True) 127 To inspect chart after parsing: p1.display_chart() 128 To inspect dtr dictionary after parsing: p1.display_dtr_dict() 129 To find the set of parse trees associated with an edge e1:: 130 131 self.get_nltk_parse_trees(e1) 132 133 To find s-edge spanning input: p1.find_spanning_edge() 134 To find tokenized data: self.input [a list] 135 To find lowercased data string: self.data [a string] 136 To see list of parse trees (after parsing): p1.parses 137 To see list of pretty printed parse trees: self.print_nltk_parses() 138 This returns set of pprinted strings as well. 139 140 To draw parse trees in a succession of Tkinter canvass windows:: 141 142 self.draw_nltk_parses() 143 144 To see if a parse exists (after parsing):: 145 146 self.parse_exists() 147 148 This returns True or False. 149 150 To find a parse edge of a given description:: 151 152 p1.find_parse_edge(...) 153 154 Arguments vary with different parsers and different notions of a 155 omplete edge description. 156 157 To draw all parses for an edge of a given description (after parsing!):: 158 159 p1.draw_nltk_parse_trees_for_edge (...) 160 """

161 - def __init__(self, grammar={},data='',trace=False):

162 self.tokenize(data) 163 self.grammar=grammar 164 self._trace = False

165

166 - def initialize_chart(self):

167 self.edge_count=0 168 ## Dictionary whose default value for unknown key is [] 169 self.dtr_dict = multivalued_dictionary.multivaluedDictionary() 170 self.parses = []

171

172 - def tokenize(self,datastring):

173 """ 174 Pretty naive tokenizer. 175 176 Lower case everything. Split on spaces. 177 """ 178 self.data = datastring.lower() 179 self.input = datastring.split()

180

181 - def reset_input (self,data):

182 if data: 183 self.tokenize(data) 184 self.initialize_chart() 185 # result of tokenization stored here 186 return self.input

187

188 - def parse_input (self,data='',trace=False):

189 self._trace=trace 190 self.reset_input(data) 191 self.process() 192 return self.find_parses()

193

194 - def process(self):

195 """Abstract method. Implements a particular 196 a particular chart parsing algorithm.""" 197 raise Exception, """process must be implemented 198 for each parse"""

199

200 - def find_spanning_edge(self):

201 """Abstract method: 202 Shd be defined to return a spanning edge 203 from which dtr edges can be recursively retrieved 204 from self.dtr_dict""" 205 raise Exception, """find_spanning_edge must be implemented 206 for each parser"""

207 208 ### Begin parse retrieval methods 209

210 - def parse_exists(self):

211 edge_info=self.find_spanning_edge() 212 if edge_info: return True 213 else: return False

214

215 - def find_nltk_parses(self):

216 parse_edge = self.find_spanning_edge() 217 if parse_edge: 218 print 'Parse found!' 219 self.parses = self.get_nltk_parse_trees(parse_edge) 220 else: 221 print 'No parses found!' 222 return False

223

224 - def get_nltk_parse_trees(self, edge):

225 result = [] 226 for dtr_list in self.dtr_dict[edge]: 227 for dtr_tree_list in\ 228 cross_multiply([self.get_nltk_parse_trees(dtr)\ 229 for dtr in dtr_list]): 230 result.append(tree.Tree(edge.mother_cat,dtr_tree_list)) 231 if result: 232 return result 233 else: 234 # No Daughters! Leaf node! 235 return [tree.Tree(edge.mother_cat,[self.input[edge.start]])]

236

237 - def find_list_parses(self):

238 parse_edge = self.find_spanning_edge() 239 if parse_edge: 240 print 'Parse found!' 241 self.parses = self.get_list_parse_trees(parse_edge) 242 else: 243 print 'No parses found!' 244 return False

245

246 - def get_list_parse_trees(self, edge):

247 """Analogue of get_nltk_parse_trees that 248 doesnt return nltk trees, just ordinary 249 list-representations of trees.""" 250 result = [] 251 for dtr_list in self.dtr_dict[edge]: 252 result += cross_multiply([[edge.mother_cat]]+\ 253 [self.get_list_parse_trees(dtr) for dtr in dtr_list]) 254 if result: 255 return result 256 else: 257 # No Daughters! Leaf node! 258 return [[edge.mother_cat,self.input[edge.start]]]

259 260 try: 261 tree 262 find_parses = find_nltk_parses 263 except NameError: 264 find_parses = find_list_parses 265 266 ### End parse retrieval methods 267

268 - def display_chart(self):

269 """Abstract method: 270 Shd be defined with parser-specific print methods 271 to print each edge in the chart. 272 """ 273 raise Exception, """display_chart must be implemented 274 for each parser"""

275

276 - def draw_nltk_parses(self):

277 for p in self.parses: 278 p.draw()

279

280 - def print_nltk_parses(self):

281 # pstrings = [p.pp() for p in self.parses] 282 for p in self.parses: 283 print p 284 print

285

286 - def draw_list_parses(self):

287 """Analogue of draw_nltk_parses for list representations of tree. 288 Note: nltk still needed.""" 289 for p in self.parses: 290 list_to_tree(p).draw()

291

292 - def tex_output_parses (self, filename, scale=1.0):

293 """ 294 This method is called as follows:: 295 296 >>> p1.tex_output_parses('parses.tex') 297 298 This creates a LaTex file which contains latex tree drawing 299 instructions for all parses of the last sentence parsed. The 300 LaTex qtree package is assumed to be installed. 301 C{self.parses} contains a list of nltk trees for which the 302 nltk method C{pprint_latdex_qtree} is called. 303 304 If the parse trees are not fitting on a page, use the optional 305 second argument of the method, which will will shrink the 306 trees using the graphics package scalebox command:: 307 308 >>> p1.tex_output_parses('parses.tex', 0.6) 309 310 prints trees shrunk to .6 their original dimensions. 311 """ 312 fh = open(filename,'w') 313 plist = self.parses 314 num_parses = len(plist) 315 print >> fh, hdr 316 print >> fh, 'There were %d parses.' % (num_parses,) 317 for (ind,t) in enumerate(self.parses): 318 ct = ind + 1 319 print >> fh 320 print >> fh, pre_filler % (ct,scale), 321 print >> fh, t.pprint_latex_qtree() 322 print >> fh, post_filler 323 # dont do these things on last parse 324 # if ct <> num_parses : 325 # pass 326 print >> fh, end 327 fh.close()

Source Code for Module parser_course.small_parsers.small_parser