fsa_recognizer.read_fst

1 """ 2 This module contains code for turning different 3 file representations of FSTs into Python FST reps: 4 a list of dictionaries (with upper-lg keys) of dictionaries 5 (with lower lgs keys). These can then be 6 used by the transducer generation and analysis 7 code in L{fsa_recognizer.fta_rec}. 8 9 The toplevel function is L{read_fst_file.make_fst_from_file}, 10 which takes a compiler instance (particular 11 to the format of the file being read) and 12 a filename argument, returning an FST. 13 14 Implemented formats include ATT graphviz ("dot") files 15 and xerox xfst ".net" files. 16 """ 17 18 import re 19 20 ################################################################ 21 ################################################################ 22 ##### Top Level Function (and utilities) 23 ################################################################ 24 ################################################################ 25

26 -def make_fst_from_file (file,Compiler):

27 """ 28 Open source file C{file}, call the appropriate 29 compiler C{CompilerInst}, and turn it into a Python dictionary based 30 represntation of an FST, suitable for the code in 31 in L{fsa_recognizer.fta_rec}. 32 33 Assumptions for the dot file: 34 - node names are string reps of integers 35 - The node numbered 0 is the start node 36 - epsilon rep is as defined in upper/lower 37 char_map. 38 39 Assumptions for FST: 40 - upper language comes FIRST in state-transition fns (STFs): 41 That is, an STF is a function from upper-language chars 42 to functions from lower lang chars to state-sets 43 (FSA transition functions on the lower language). 44 45 @param file: string (filename path) 46 @rtype fst (pair of dictionary and tuple) 47 """ 48 (_, raw_dictionary) = Compiler.read_file(file) 49 feasible_pairs = get_feasible_pairs(raw_dictionary, Compiler) 50 return make_fst(feasible_pairs, raw_dictionary, Compiler)

51

52 -def make_fst(feasible_pairs,raw_dictionary, Compiler):

53 """ 54 We assume states are string reps of ints and 55 the state numbered 0 is always initial state. 56 57 We create an FST whose state dictionaries are 58 all totally defined for the set of feasiable 59 pairs, filling in the gaps in raw_dictionary with 60 transitions to ('und',). 61 62 We sort the dictionary items according to the individual 63 compiler's cmp fn, assuming that the start state will 64 get ranked first. 65 66 We assume every state fn in raw_dic is a dictionary. 67 Two keys are special: 68 69 - 'final' takes a boolean value specifying whether this 70 is a final state. 71 - 'start' takes a boolean value specifying whether this 72 is a start state. (not used) 73 74 All other keys are upper lg chars whose values are 75 dictionaries whose keys are lower lg chars:: 76 77 raw_dic[state][upper][lower] = the set of states transitioned to 78 in teh state C{state} when upper 79 char C{char} is paired with lower lg 80 char C{lower} 81 """ 82 raw_items = raw_dictionary.items() 83 raw_items.sort(Compiler.cmp_item) 84 transitions = [] 85 finals = [] 86 for item in raw_items: 87 state_dic = {} 88 transitions.append(state_dic) 89 if item[1]['final']: 90 finals.append(item[0]) 91 for pair in feasible_pairs: 92 upper_char_dic = state_dic.setdefault(Compiler.upper_char_map[pair[0]],{}) 93 upper_char_raw_dic = item[1].get(pair[0],{}) 94 new_states = upper_char_raw_dic.get(pair[1], ('und',)) 95 upper_char_dic[Compiler.lower_char_map[pair[1]]] = new_states 96 return (transitions, tuple(finals))

97 98

99 -def get_feasible_pairs(raw_dic, Compiler):

100 """ 101 Assumptions for FST: 102 - The set of feasible pairs includes some initial 103 set passed in as C{initial_freasible_pairs}. Usually 104 this is the identity mapping for all lower case 105 alphabetic characters. 106 - All other possible mappings are instantiated 107 in the raw dictionary as pairings of chars 108 in the upper and lower alphabet. 109 Search the raw_dic rep of the FST, 110 adding any new pairs found to the initial mapping. 111 112 @param raw_dic: a dictionary 113 @rtype: a set (of pairs of chars) 114 """ 115 res = Compiler.feasible_pairs 116 for transition_func in raw_dic.values(): 117 for upper_char in transition_func.keys(): 118 ## ignore raw dic info about starting/final states 119 if upper_char not in ['start','final']: 120 # Fill out char map for non lower case chars 121 add_to_char_map_if_necessary(Compiler.upper_char_map,\ 122 upper_char) 123 for lower_char in transition_func[upper_char].keys(): 124 add_to_char_map_if_necessary(Compiler.lower_char_map,\ 125 lower_char) 126 res.add((upper_char,lower_char)) 127 # Find upper zero char 128 for item in Compiler.upper_char_map.items(): 129 if item[1] == 'epsilon': 130 upper_zero = item[0] 131 # Find lower zero char 132 for item in Compiler.lower_char_map.items(): 133 if item[1] == 'epsilon': 134 lower_zero = item[0] 135 # Hack. Add epsilon:epsilon to feasible pairs to ensure that 136 # epsilon belongs to both upper and lower alphabets 137 res.add((upper_zero,lower_zero)) 138 return res

139

140 -def add_to_char_map_if_necessary(char_map,char):

141 try: 142 char_map[char] 143 except KeyError: 144 char_map[char] = char

145

146 -def expand_char_interval(low,high):

147 """ 148 Expand a char interval rep like a..z 149 into a list of characters. 150 151 @param low: character the lower bound character in the interval 152 @param high: character the upper bound character in the interval 153 @rtype: list of characters 154 """ 155 156 code_range = range( ord(low),ord(high)+1) 157 char_list = [] 158 for code in code_range: 159 char_list.append(chr(code)) 160 return char_list

161

162 -def blank_line (line):

163 line = line.strip() 164 if re.match(blank_line_re, line): 165 return True 166 else: 167 return False

168 169 ######################################################################## 170 ### G l o b a l s 171 ######################################################################## 172 173 # Default alphabet ued by all FST compilers 174 alphabet = 'abcdefghijklmnopqrstuvwxyz' 175 176 ########################################################################### 177 ########################################################################### 178 #### C l a s s s t u f f f o r F S T f i l e c o m p i l e r s 179 ########################################################################### 180 ########################################################################### 181

182 -class FstCompiler (object):

183 """ 184 Class for compiling FST representations in various file 185 formats into equivalent raw Python dictionaries such that:: 186 187 D[state][upper_char][lower_char] = set of states 188 189 The intent of this 'raw' dictionary representation is to be a 190 modular portable representation of the FST transitions and 191 final and initial state. Alphabet and feasible_pair info is 192 stored on the compiler instance. 193 194 The raw dictionary is not suitable for the code in L{fsa_recognizer.fta_rec} 195 because it does not define a total functions on the alphabet. 196 To produce dictionaries of that form call L{fsa_recognizer.fta_rec.make_fst_from_file}, which calls an instance of this class for initial 197 file processing and then fills out the partial transition functions. 198 199 Default assumptions for FST: 200 - The set of feasible pairs includes the identity 201 mapping for all lower case alphabetic characters 202 - characters in the files are mapped to themselves 203 in the Python FST (upper/lower)_char_map 204 - default mapping to be extended case by case. 205 """ 206 207 ####### char mappings 208 ## These are mappings from the file character rep 209 ## in the upper/lower lg to 210 ## the Python FST character rep in the upper/lower lg. 211 ## epsilon rep in the file is presumably handled here. 212

213 - def __init__(self):

214 self.upper_char_map = dict(zip(alphabet,alphabet)) 215 self.lower_char_map = dict(zip(alphabet,alphabet)) 216 self.feasible_pairs = set(zip(alphabet,alphabet))

217

218 - def read_file(self, file):

219 """ 220 Abstract interface for generic read file fn. 221 222 Open FST file C{file} and return C{raw_dic}, a 223 dictionary representation of the FST represented 224 there. 225 226 The keys of the dictionary are state names, 227 possibly renamed from the original representation. 228 C{raw_dic}[state_name] is a transition function for 229 that state represented as a dictionary. 230 231 Two keys are special in transition functions: 232 233 - 'final' takes a boolean value specifying whether this 234 is a final state. 235 - 'start' takes a boolean value specifying whether this 236 is a start state. (not used) 237 238 All other keys in transition fns are upper lg chars whose values are 239 dictionaries whose keys are lower lg chars:: 240 241 raw_dic[state][upper][lower] = the set of states transitioned to 242 in the state C{state} when upper 243 char C{char} is paired with lower lg 244 char C{lower} 245 246 247 Implementations of this method should always call 248 initialize reader first, to zero out all the 249 file-specific attributes of C{self}. 250 251 @param file: file containing some representation of an FST. 252 @rtype: dictionary of dictionary of dictionaries 253 """ 254 print 'Abstract method to be implemented by subclasses'

255

256 - def initialize_reader (self):

257 """ 258 Zero out all file specific attributes before 259 compiling a new file. 260 261 Called by all implementations of C{read_file}. 262 """ 263 print 'Abstract method to be implemented by subclasses'

264 265 #################################################### 266 ### D o t R E s #### 267 #################################################### 268 269 graph_type_re = re.compile(r'digraph') 270 blank_line_re = re.compile(r'\s+$') 271 node_dec_re = re.compile(r'\s*node\s*\[(.*)\]\s*;') 272 node_name_re = re.compile(r'\s*(\w+)\s*\[(.*)\]\s*;') 273 node_connection_re = re.compile(r'\s*(\w+)\s*->\s*(\w+)\s*\[(.*)\]\s*;') 274 275 label_re = re.compile(r'[ ,\{\}]') 276 char_interval_re = re.compile(r'(.)\.\.(.)') 277 278 att_val_re = re.compile(r'\s*\,?(\w+)\s*=\s*(\d+)') 279 ## Question mark following + here specifies non-greedy 280 ## match. Does right thing when there are 2 quoted strings 281 ## in the att-val string (2 or more attributes). 282 att_val_re2 = re.compile(r'\s*\,?(\w+)\s*=\s*"(.+?)"') 283 284 #################################################### 285 ### D o t C l a s s #### 286 #################################################### 287

288 -class DotFstCompiler (FstCompiler):

289 """ 290 Code for compiling an ATT graphviz ("dot") 291 representation of an FST into a Python FST rep. 292 """ 293

294 - def __init__(self):

295 FstCompiler.__init__(self) 296 ## In the char map updates below, dot file '[]' maps to 'epsilon' 297 self.upper_char_map.update({'[]':'epsilon', '(+)':'+', '(#)':'#'}) 298 self.lower_char_map.update({'[]':'epsilon'})

299

300 - def initialize_reader(self):

301 """ 302 This function zeroes out all the records kept 303 during the reading of a fst file so that the same 304 compiler can be used on a new file safely. 305 Shd be called by read_file. 306 """ 307 self.alphabet = set([]) 308 self.node_dic = {} 309 self.gtype = ''

310 311 #### Top-level Dot File read method 312

313 - def read_file(self, file):

314 """ 315 Open dot file C{file} and return C{raw_dic}, a 316 dictionary representation of the FST represented 317 there. 318 319 We retain the state names in the dot file, but 320 we assume they are string reps of integers, an 321 assumption not b uilt into dot-files. 322 323 See interface documentation for Class FstCompiler 324 method read_file. This method respects that interface. 325 326 @param file: file containing dot representation of an FST. 327 @rtype: dictionary of dictionary of dictionaries 328 """ 329 self.initialize_reader() 330 dot_file = open(file,'r') 331 lines = dot_file.readlines() 332 dot_file.close() 333 self.gtype = self.find_graph_type(lines) 334 self.node_dic = {} 335 ## Commence parsing lines in the dot file 336 for line in lines: 337 node_dec_match = re.match(node_dec_re,line) 338 node_name_match = re.match(node_name_re,line) 339 node_connection_match = re.match(node_connection_re,line) 340 if node_dec_match: 341 pass 342 elif node_name_match: 343 node_name = node_name_match.groups()[0] 344 node_atts = node_name_match.groups()[1] 345 att_dic = self.process_atts(node_atts) 346 self.classify_node(node_name,att_dic, self.node_dic) 347 elif node_connection_match: 348 src_node_name = node_connection_match.groups()[0] 349 tgt_node_name = node_connection_match.groups()[1] 350 node_atts = node_connection_match.groups()[2] 351 att_dic = self.process_atts(node_atts) 352 self.classify_link(src_node_name,tgt_node_name,\ 353 att_dic,self.node_dic) 354 return (self.gtype, self.node_dic)

355

356 - def cmp_item (self, item1,item2):

357 """ 358 Items are pairs of state names and transition fns. 359 360 We assume state names are string reps of ints and 361 that the starting state has been named '0'. 362 """ 363 return cmp(int(item1[0]),int(item2[0]))

364

365 - def classify_node(self, node_name, att_dic, node_dic):

366 """ 367 Determine whether node_name is a start state 368 and whether it is a final state. Update C{node_dic} 369 with this info. 370 371 Assume node names are string reps of integers, 372 but map them to int keys in C{node_dic} 373 374 @param node_name: string 375 @param att_dic: dictionary (of attribute value pairs) 376 @param node_dic: dictionary (the FST we are updating) 377 @rtype: None 378 """ 379 char_dic = node_dic.setdefault(int(node_name),{}) 380 char_dic['start'] = self.is_start_node(att_dic) 381 char_dic['final'] = self.is_final_node(att_dic)

382

383 - def classify_link(self,src_node_name, tgt_node_name, att_dic, node_dic):

384 """ 385 Turn the label in a dot file string rep into 386 a series of transitions in the Python FST we are building. 387 388 C{src_node_name} is the name of the state whose transition 389 function we will be updating. 390 391 Assume node names are string reps of integers, 392 but map them to int keys in C{node-dic} 393 394 @param src_node_name: string 395 @param tgt_node_name: string 396 @param att_dic: dic represention of att value pairs attached to 397 the transition from src to tgt. 398 @param node_dic: the dictionary representation of the 399 FST to be updated 400 """ 401 label_string = att_dic['label'] 402 label_set0 = label_re.split(label_string) 403 label_set = filter(lambda y: y <> '', label_set0) # get rid of empty strings 404 src_node_dic = node_dic.setdefault(int(src_node_name),{}) 405 for l in label_set: 406 char_interval_match = re.match(char_interval_re, l) 407 try: 408 (upper_char,lower_char) = l.split(':') 409 except ValueError: 410 upper_char = lower_char = l 411 if char_interval_match: # e.g. a..z 412 for char in expand_char_interval(char_interval_match.groups()[0],\ 413 char_interval_match.groups()[1]): 414 upper_char_dic = src_node_dic.setdefault(char,{}) 415 target_nodes = upper_char_dic.setdefault(char,()) 416 upper_char_dic[char]=target_nodes+(int(tgt_node_name),) 417 else: 418 upper_char_dic = src_node_dic.setdefault(upper_char,{}) 419 target_nodes = upper_char_dic.setdefault(lower_char,()) 420 upper_char_dic[lower_char]=target_nodes+(int(tgt_node_name),)

421 422

423 - def is_start_node (self, att_dic):

424 green = False 425 try: 426 if att_dic['fillcolor'] == 'green': 427 green = True 428 elif att_dic['fillcolor'] == 'purple': 429 green = True 430 except: 431 green = False 432 try: 433 if att_dic['comment'] == 'start': 434 green = True 435 except: 436 pass 437 return green

438

439 - def is_final_node (self,att_dic):

440 red = False 441 try: 442 if att_dic['fillcolor'] == 'red': 443 red = True 444 elif att_dic['fillcolor'] == 'purple': 445 red = True 446 except: 447 red = False 448 return red

449

450 - def process_atts(self,att_string):

451 """ 452 Turn a string rep of node or transition attributes from 453 a dot file into a dictionary with keys = to attributes. 454 455 @param att_string: string 456 @rtype: dictionary 457 """ 458 equations = att_string 459 att_val_dic = {} 460 while equations: 461 att_val_match = re.match(att_val_re,equations) 462 if not att_val_match: 463 att_val_match = re.match(att_val_re2,equations) 464 if att_val_match: 465 att_val_dic[att_val_match.groups()[0]] =\ 466 att_val_match.groups()[1] 467 equations = equations[att_val_match.end():] 468 else: 469 equations = '' 470 return att_val_dic

471

472 - def find_graph_type (self,lines):

473 while lines: 474 try: 475 graph_type = lines[0] 476 except IndexError: 477 print 'No graph found!' 478 match_obj = re.match(graph_type_re, graph_type) 479 if match_obj: 480 del lines[0] 481 return match_obj.group() 482 elif blank_line(lines[0]): 483 del lines[0] 484 else: 485 print 'Unknown graph type!' 486 return None

487 488 489 490 #################################################### 491 ### X f s t R E s #### 492 #################################################### 493 494 flags_re = re.compile(r'Flags:(.*)$') 495 net_dec_re = re.compile(r'Net:') 496 sigma_dec_re = re.compile(r'Sigma:(.*)$') 497 size_dec_re = re.compile(r'Size:\s*(\d+)\,') 498 arity_dec_re = re.compile(r'Arity:\s*(\d+)') 499 state_desc_re = re.compile(r'(.?s\d+):(:?\s*(.+)\s*->\s*(.+)\.|\s*$no arcs$)') 500 501 #################################################### 502 ### X f s t C l a s s #### 503 #################################################### 504

505 -class XfstCompiler (FstCompiler):

506 """ 507 Code for compiling an Xerox xfst 508 representation of an FST into a Python FST rep. 509 """ 510

511 - def __init__(self):

512 FstCompiler.__init__(self) 513 ## In the char map updates below, dot file '[]' maps to 'epsilon' 514 self.upper_char_map.update({'0':'epsilon'}) 515 self.lower_char_map.update({'0':'epsilon'}) 516 ## For eachs state key a list of transition pairs 517 self.transition_pairs_dic = {}

518

519 - def initialize_reader(self):

520 """ 521 This function zeroes out all the records kept 522 during the reading of a fst file so that the same 523 compiler can be used on a new file safely. 524 Shd be called by read_file. 525 """ 526 self.alphabet = set([]) 527 self.node_dic = {} 528 self.gtype = '' 529 self.transition_pairs_dic = {} 530 self.name_mapping = {}

531 532 #### Top-level XFST Net File read method 533

534 - def read_file(self, file):

535 """ 536 Open xerox ".net" file C{file} and return C{raw_dic}, a 537 dictionary representation of the FST represented 538 there. 539 540 We retain the state names in the net file, 541 which determine which is a start state ('s0') 542 and which are final states (state names beginning 543 with 'fs'. 544 545 See interface documentation for Class FstCompiler 546 method read_file. This method respects that interface. 547 548 @param file: file containing xfst representation of an FST. 549 @rtype: dictionary of dictionary of dictionaries 550 """ 551 self.initialize_reader() 552 net_file = open(file,'r') 553 lines = net_file.readlines() 554 net_file.close() 555 self.gtype = 'digraph' 556 ## Commence parsing lines in the net file 557 print self.node_dic 558 print self.transition_pairs_dic 559 for line in lines: 560 print line 561 for line in lines: 562 flags_match = re.match(flags_re,line) 563 net_dec_match = re.match(net_dec_re,line) 564 sigma_dec_match = re.match(sigma_dec_re,line) 565 size_dec_match = re.match(size_dec_re,line) 566 arity_dec_match = re.match(arity_dec_re,line) 567 state_desc_match = re.match(state_desc_re,line) 568 if flags_match or net_dec_match: 569 pass 570 elif sigma_dec_match: 571 self.alphabet = set(sigma_dec_match.groups()[0].split()) 572 self.alphabet = set(filter(lambda y: y <> '', self.alphabet)) # get rid of empty strings 573 ## Handle 'other' char: '?' 574 if '?' in self.alphabet: 575 self.alphabet.remove('?') 576 # other = denotation of '?': chars not mentioned elsewhere 577 self.other = set(alphabet) - self.alphabet 578 # Now update with entire alphabet stored in global var 579 self.alphabet.update(alphabet) 580 elif size_dec_match: 581 self.size = int(size_dec_match.groups()[0].strip()) 582 elif arity_dec_match: 583 self.arity = int(arity_dec_match.groups()[0].strip()) 584 elif state_desc_match: 585 state = state_desc_match.groups()[0] 586 (StartState,FinalState,NewName) = self.process_state(state) 587 transition_function = self.node_dic.setdefault(NewName,{}) 588 transition_function['start'] = StartState 589 transition_function['final'] = FinalState 590 self.name_mapping[state] = NewName 591 transition_set = self.transition_pairs_dic.setdefault(NewName,set([])) 592 transitions = state_desc_match.groups()[1] 593 transition_set.update(self.process_transitions(state,transitions)) 594 ## Because state names are changed in the Python FST, 595 ## we must wait until all state declarations 596 ## are read to enter the transitions (and their destinations!) 597 ## in the node_dic 598 self.classify_links(self.node_dic) 599 return (self.gtype, self.node_dic)

600

601 - def process_state (self, state):

602 final = False 603 start = False 604 if state.startswith('fs'): 605 final = True 606 newstring = state[2:] 607 if state.startswith('s'): 608 newstring = state[1:] 609 if int(newstring) == 0: 610 start = True 611 return (start, final, int(newstring))

612

613 - def cmp_item(self, item1,item2):

614 return cmp(item1[0], item2[0])

615

616 - def process_transitions(self,state,transitions):

617 """ 618 C{transitions} a string of comma separated state transitions 619 of the form:: 620 621 word -> state 622 623 @rtype: a set of pairs of word and state. 624 """ 625 res = set([]) 626 for transition in transitions.split(','): 627 pair = transition.split('->') 628 if len(pair) == 2: 629 res.add((pair[0].strip('. '),pair[1].strip('. '))) 630 elif re.search(r'no arcs', transition): 631 pass 632 else: 633 print 'Unknown transition for %s: %s' % (state,transition) 634 return res

635

636 - def classify_links (self, node_dic):

637 """ 638 Add info to node dic for state and self.transition_pairs[state] 639 640 This is called after all state declarations have been read in 641 and a name mapping assigned in self.name_mapping. 642 643 @param node_dic: the dictionary representation of the 644 FST to be updated 645 """ 646 print node_dic 647 print node_dic.keys() 648 print self.transition_pairs_dic 649 for state in node_dic.keys(): 650 src_state_dic = node_dic[state] 651 for pair in self.transition_pairs_dic[state]: 652 if pair[0] == '?': 653 expanded_pairs = [] 654 for c in self.other: 655 expanded_pairs.append((c,pair[1])) 656 else: 657 expanded_pairs = [pair] 658 for p in expanded_pairs: 659 try: 660 (upper_char,lower_char) = p[0].split(':') 661 except ValueError: 662 upper_char = lower_char = p[0] 663 upper_char_dic = src_state_dic.setdefault(upper_char,{}) 664 target_nodes = upper_char_dic.setdefault(lower_char,()) 665 upper_char_dic[lower_char]=\ 666 target_nodes+(int(self.name_mapping[p[1]]),)

667

Source Code for Module fsa_recognizer.read_fst_file