fsa_recognizer.fta

1 ############################################################################## 2 ### F S A T r a n s d u c e r s #### 3 ############################################################################## 4 5 """ 6 A finite state transducer (FST) recognizer. 7 8 Our example language relation is transduced sheep language. The upper 9 alphabet will be: 10 11 - {a, b, !} 12 13 The upper language will be sheep language. The lower alphabet will be: 14 15 - {1, 2, 3} 16 17 We will transduce all M{a}s into M{1}s, M{b}s into M{2}s and M{3}s and 18 pass all exclamation points through. So 19 the feasible pairs will be: 20 21 - {a:1, b:2, b:3, !:!} 22 23 We generalize a C{TransitionTable} to FSTs: an FST 24 will be a list of dictionaries of dictionaries. 25 26 For a state S{sigma} that includes the following transitions: 27 28 - S{sigma}[a:1] = (4,5) 29 - S{sigma}[b:2] = (3, ) 30 31 the corresponding state-transition function is:: 32 33 {'a': {1: (4,5)} 34 'b': {2: (3, )}} 35 36 @var einsertion_machine: transducer for the einsertion rule given in 37 (U{Jurafsky and Martin, Speech and Natural Language Processing, Fig 3.14<http://www-rohan.sdsu.edu/~gawron/compling/chap3/fig03.14.pdf>}). Minor changes 38 spelled out in the dot file in C{einsertion.dot}, from 39 which this transducer is automatically derived. 40 """ 41 42 ## Assume upper and lower tape instantiated, return True or False

43 -def nondet_transduce_recognize(string1,string2,machine):

44 """ 45 Return True if C{string1} is related to C{string2} by 46 the regular relation defined in the FST given by C{transitions} 47 and C{finals}. 48 49 Uses an agenda just as in the nondet recognition algorithm 50 in L{fsa_recognizer.nondet_fsa_rec.nondet_recognize}. However 51 an agenda search state now consists of a triple, a machine state and 52 two string positions, the position on the upper tape 53 and the position on the lower tape. 54 55 >>> nondet_transduce_recognize('baa!','211!', transduced_sheep_language) 56 (0, 0, 0)[b:2] ==> [(1, 1, 1)] 57 (1, 1, 1)[a:1] ==> [(2, 2, 2)] 58 (2, 2, 2)[a:1] ==> [(3, 3, 3), (2, 3, 3)] 59 Failing search state (2, 3, 3): Backtracking 60 (3, 3, 3)[!:!] ==> [(4, 4, 4)] 61 True 62 >>> nondet_transduce_recognize('baaa!','3111!', transduced_sheep_language) 63 (0, 0, 0)[b:3] ==> [(1, 1, 1)] 64 (1, 1, 1)[a:1] ==> [(2, 2, 2)] 65 (2, 2, 2)[a:1] ==> [(3, 3, 3), (2, 3, 3)] 66 (2, 3, 3)[a:1] ==> [(3, 4, 4), (2, 4, 4)] 67 Failing search state (2, 4, 4): Backtracking 68 (3, 4, 4)[!:!] ==> [(4, 5, 5)] 69 True 70 71 More examples: 72 73 >>> nondet_transduce_recognize('baa','211',transduced_sheep_language) 74 ... 75 False 76 >>> nondet_transduce_recognize('baa!','211!',transduced_sheep_language) 77 ... 78 True 79 >>> nondet_transduce_recognize('baaaa!','21111!',transduced_sheep_language) 80 ... 81 True 82 >>> nondet_transduce_recognize('baaaa!','2111!',transduced_sheep_language) 83 ... 84 False 85 86 The last example fails: Too many a's, not enough 1's. 87 88 >>> nondet_transduce_recognize('baaa!','21111!',transduced_sheep_language) 89 False 90 91 Too many 1's, not enough a's. 92 93 Recognition is initialized with a search consisting of the the starting 94 machine state and the start of tape1 and tape2:: 95 96 Initial search state = (starting_machine_state, beginning of tape1, 97 beginning of tape2) 98 = (0,0,0) 99 100 @param string1: upper string potentially a member of upper language 101 @param string2: lower string potentially a member of lower language 102 @param transitions: an FSTTransitionTable 103 @param finals: a tuple of final states. 104 @rtype: Boolean 105 """ 106 (transitions,finals) = machine 107 agenda = [(0,0,0)] 108 (last_index1,last_index2) = (len(string1),len(string2)) 109 current_search_state = agenda.pop() 110 while True: 111 (machine_state,tape_index1,tape_index2) = current_search_state 112 if transduce_accept_state(current_search_state,last_index1,last_index2,finals): 113 return True 114 else: 115 if tape_index1 < last_index1 and\ 116 tape_index2 < last_index2: 117 agenda.extend(transduce_analyze_new_states(current_search_state, 118 transitions,string1,string2)) 119 if agenda: 120 current_search_state = agenda.pop() 121 else: 122 return False

123

124 -def transduce_accept_state(search_state,last_index1,last_index2,finals):

125 (machine_state,tape_index1,tape_index2) = search_state 126 if tape_index1 == last_index1 and\ 127 tape_index2 == last_index2 and\ 128 machine_state in finals: 129 return True 130 else: 131 return False

132 133

134 -def transduce_analyze_new_states(search_state,transitions,string1,string2):

135 """We assume every input alphabet includes S{epsilon} 136 and therefore that every transition table is defined 137 for S{epsilon}. 138 139 We deal with S{epsilon}-transitions first, if any. Compare 140 L{fsa_recognizer.nondet_fsa_rec.generate_new_states}). 141 """ 142 (current_machine_state,current_tape_index1,current_tape_index2) = search_state 143 current_input1 = string1[current_tape_index1] 144 current_input2 = string2[current_tape_index2] 145 next_tape_index1 = current_tape_index1 + 1 146 next_tape_index2 = current_tape_index2 + 1 147 ## First grab any epsilon transitions on tape1 148 epsilon_dict_tape1=transitions[current_machine_state]['epsilon'] 149 states1 = [(machine_state,current_tape_index1,next_tape_index2) for machine_state in \ 150 epsilon_dict_tape1[current_input2] if not machine_state == 'und'] 151 if states1: 152 print '1: %s[eps][%s] ==> %s' % (search_state,current_input2,states1) 153 ## First grab any epsilon transitions on tape2 154 epsilon_dict_tape2=transitions[current_machine_state][current_input1] 155 states2 = [(machine_state,next_tape_index1,current_tape_index2) for machine_state in \ 156 epsilon_dict_tape2['epsilon'] if not machine_state == 'und'] 157 if states2: 158 print '2: %s[%s][eps] ==> %s' % (search_state,current_input1,states2) 159 ## Now grab any transitions defined for current_input 160 try: 161 new_dict_tape1=transitions[current_machine_state][current_input1] 162 other_states = [(machine_state,next_tape_index1,next_tape_index2) for machine_state in \ 163 new_dict_tape1[current_input2] if not machine_state == 'und'] 164 except: 165 # current_input not in alphabet 166 print ' %s:%s not in input alphabet' % (current_input1,current_input2) 167 other_states = [] 168 states1.extend(states2) # append the states2 onto states1 169 states1.extend(other_states) # append the other_states onto epsilon_states 170 if other_states: 171 print ' %s[%s:%s] ==> %s' % (search_state,current_input1,current_input2,other_states) 172 if not states1: 173 print ' Failing search state %s: Backtracking' % (search_state,) 174 return states1

175 176 ######################################################################## 177 ### 178 ### A n a l y z e r / G e n e r a t o r 179 ### 180 ######################################################################## 181 182 183 import fsa_recognizer.supertuple 184 185 SearchState = fsa_recognizer.supertuple.superTuplePlus('SearchState',\ 186 'state',\ 187 'tape_pos',\ 188 'related_string') 189

190 -def apply_dn(surface_string,machine, debug=False):

191 """ 192 Return the set of strings for lower langauge 193 that are paired by C{machine} with string. Return 194 the empty set if there are none. 195 196 Uses an agenda just as in the nondet recognition algorithm 197 in L{fsa_recognizer.nondet_fsa_rec.nondet_recognize}. However 198 an agenda search state now consists of a triple, a machine state and 199 a surface string position, and the lexical string constructed thus 200 far. 201 202 >>> apply_dn('baa!', transduced_sheep_language) 203 (0, 0, 0)[b:2] ==> [(1, 1, 1)] 204 (1, 1, 1)[a:1] ==> [(2, 2, 2)] 205 (2, 2, 2)[a:1] ==> [(3, 3, 3), (2, 3, 3)] 206 Failing search state (2, 3, 3): Backtracking 207 (3, 3, 3)[!:!] ==> [(4, 4, 4)] 208 {211!} 209 210 More examples: 211 212 >>> apply_dn('baa',transduced_sheep_language) 213 ... 214 {} 215 >>> apply_dn('baa!',transduced_sheep_language) 216 ... 217 {211!} 218 >>> apply_dn('baaaa!',transduced_sheep_language) 219 ... 220 {21111!} 221 222 @param surface_string: string potentially a member of surface language 223 @param transitions: an FSTTransitionTable 224 @param finals: a tuple of final states. 225 @rtype: List of strings 226 """ 227 (transitions,finals) = machine 228 transitions = make_generation_transducer(transitions,debug) 229 return find_related_string(transitions,finals,surface_string,debug)

230 ############################################################## 231

232 -def apply_up(lex_string,machine, debug=False):

233 """ 234 Return the set of strings for surface langauge 235 that are paired by C{machine} with lexical string 236 C{lex_string}. Return the empty list if there are none. 237 238 Uses an agenda just as in the nondet recognition algorithm 239 in L{fsa_recognizer.nondet_fsa_rec.nondet_recognize}. However 240 an agenda search state triple now consists of a machine state, 241 a lex string position, and the surface string constructed thus 242 far. 243 244 >>> apply_up('311!',transduced_sheep_language) 245 0[3:b] ==> 1 246 1[1:a] ==> 2 b 247 2[1:a] ==> 3 ba 248 2[1:a] ==> 2 ba 249 Failing search state SearchState(2, 3, 'baa'): Backtracking 250 3[!:!] ==> 4 baa 251 ['baa!'] 252 253 More examples: 254 255 >>> apply_up('112',transduced_sheep_language) 256 ... 257 [] 258 >>> apply_up('211!',transduced_sheep_language) 259 ... 260 [baa!] 261 >>> apply_up('21111!',transduced_sheep_language) 262 ... 263 [baaaa!] 264 265 @param lex_string: lex string potentially a member of lexical language 266 @param transitions: an FSTTransitionTable 267 @param finals: a tuple of final states. 268 @rtype: List 269 """ 270 (transitions,finals) = machine 271 transitions = make_analysis_transducer(transitions,debug) 272 return find_related_string(transitions,finals,lex_string,debug)

273 ############################################################## 274

275 -def find_related_string(transitions,finals,input_string, debug):

276 """ 277 This code assumes transitions has been pre-compiled 278 for either analysis or generation. So C{input_string} 279 may be either a surface or lexical string, and transitions 280 then represents the appropriate mapping to the set 281 of C{related_strings} to be found and returned. 282 283 Search is initialized with a search state consisting of the the starting 284 machine state and the start of input tape and singleton set 285 containing the empty string:: 286 287 Initial search state = (starting_machine_state, beginning of tape1, 288 empty_string) 289 = (0,0,'') 290 """ 291 agenda = [SearchState(state=0,tape_pos=0,related_string='')] 292 last_index = len(input_string) 293 related_strings = [] 294 while agenda: 295 current_search_state = agenda.pop() 296 (state,input_tape_pos,r_str) = current_search_state 297 if transduce_relate_accept_state(current_search_state,last_index,finals): 298 related_strings.append(r_str) 299 else: 300 agenda.extend(\ 301 transduce_relate_new_states(current_search_state,\ 302 transitions,input_string,\ 303 debug)) 304 return related_strings

305 306

307 -def transduce_relate_accept_state(search_state,last_index,finals):

308 (state,input_tape_pos,_) = search_state 309 if input_tape_pos == last_index and state in finals: 310 return True 311 else: 312 return False

313 314 epsilon_print_name = 'eps' 315

316 -def add_new_search_states (pairs_list, current_input, new_tape_position,\ 317 search_states, state, r_str, print_items):

318 """ 319 add new search states to C{search_states} using the (new_char, new_states) 320 pairs in C{pairs_list}, current input C{current_input}, 321 and the related string C{r_str} built thus far. 322 323 Also construct a list of items for printing debug info C{print_items} 324 """ 325 for (new_char, new_states) in pairs_list: 326 for new_state in new_states: 327 print_items.append((state,current_input,new_char,new_state, r_str)) 328 search_states.append(SearchState(state=new_state,\ 329 tape_pos=new_tape_position,\ 330 related_string=r_str + new_char))

331

332 -def transduce_relate_new_states(search_state,transitions,string,debug):

333 """ 334 Return a list of C{new_search_states} given the C{current_search_state}, 335 C{transitions} from the current transducer, and a surface C{string}. 336 337 C{search_state} is a triple of:: 338 (machine_state, surface_tape_position, related_string) 339 C{new_search_states} will be a list of same computed by: 340 - trying all permissible epsilon transitions for the machine 341 state in C{search_state} 342 - trying all permissible transitions for the character at 343 C{string}[C{surface_tape_position}] 344 """ 345 (state,input_tape_pos,r_str) = search_state 346 try: 347 current_input = string[input_tape_pos] 348 next_tape_pos = input_tape_pos + 1 349 except IndexError: 350 # Allow out of bound input indices for insertions at end of output 351 current_input = '' 352 next_tape_pos = input_tape_pos 353 new_search_states = [ ] 354 print_items = [ ] ## For printing info 355 ## First grab any epsilon transitions [deletions] on input_tape 356 epsilon_pairs = transitions[state]['epsilon'] 357 add_new_search_states(epsilon_pairs,epsilon_print_name,input_tape_pos,new_search_states,state,r_str,print_items) 358 ## Now do non-epsilon transitions 359 if current_input: 360 ## current_input empty when we are out of input chars 361 try: 362 tape_advancing_pairs = transitions[state][current_input] 363 except KeyError: 364 print ' %s: not in input alphabet' % (current_input,) 365 tape_advancing_pairs = [ ] 366 else: 367 tape_advancing_pairs = [ ] 368 add_new_search_states(tape_advancing_pairs, current_input, next_tape_pos, new_search_states, state, r_str, print_items) 369 ## Do some printing stuff 370 if new_search_states: 371 for new_item in print_items: 372 print ' %s[%s:%s] ==> %s %s' % new_item 373 if debug: 374 print ' >%s' % (new_search_states,) 375 if not new_search_states: 376 print ' Failing search state %s: Backtracking' % (search_state,) 377 ## Return 378 return new_search_states

379 380

381 -def make_generation_transducer(transitions,debug=False):

382 """ 383 An FST is a list of states, each represented by its 384 transducer transition fn (a dicts of dicts): 385 - [TTF1, TTF2, ...] 386 - TTF1 = {lex_char1: D1, lex_char2:D2,... } 387 Each TTFi (Transducer Transition Function) is a function from lexical 388 chars to an FSA Transition function (a function from chars to 389 sets of states). 390 391 Each embedded dictionary, call it Di, thus represents an FSA 392 transition function. 393 394 Transduce such a list of TTFs into a different list of dictionaries 395 - [TTF1', TF2', ...] 396 - TTF1' = {lex_char1: L1, lex_char2: L2, ...} 397 Where each Li is just Di.items() [the list of key-value pairs] 398 with the undefined transitions removed:: 399 Li = [pair for pair in Di.items()\ 400 if state transition of pair is not undefined] 401 """ 402 new_transitions = [] 403 for state in transitions: 404 transition_function = {} 405 new_transitions.append(transition_function) 406 for char in state.keys(): 407 transition_function[char] = \ 408 [handle_eps(pair) for pair in state[char].items()\ 409 if pair[1]<>('und',)] 410 print_transitions_table(new_transitions,debug) 411 return new_transitions

412

413 -def make_analysis_transducer(transitions,debug=False):

414 """ 415 An FST is a list of states, each represented by its 416 transducer transition fn (a dicts of dicts): 417 - [TTF1, TTF2, ...] 418 - TTF1 = {surf_char1: D1, surf_char2:D2,... } 419 Each TTFi (Transducer Transition Function) is a function from surface 420 chars to an FSA Transition function (a function from chars to 421 sets of states). 422 423 Transduce Each TTF into a dictionary TF1'; 424 - [TF1', TF2', ...] 425 - TF' = {lex_char1: L1, lex_char2: L2, ...} 426 Where each lex_chari is associated with the set of surface_chars that 427 map to it in this state and each surface char, surf_charj 428 is pairsed with the set of states that surf_charj:lex_chari map to 429 in this state. 430 """ 431 new_transitions = [] 432 for state in transitions: 433 transition_function = {} 434 new_transitions.append(transition_function) 435 for surf_char in state.keys(): 436 for lex_char in state[surf_char].keys(): 437 new_states = state[surf_char][lex_char] 438 this_pair_list = transition_function.setdefault(lex_char,[]) 439 if new_states <> ('und',): 440 this_pair_list.append(handle_eps((surf_char,new_states))) 441 print_transitions_table(new_transitions,debug) 442 return new_transitions

443

444 -def handle_eps (pair):

445 """ 446 Output epsilons are just represented as the empty 447 string. Do this transduction here. 448 """ 449 states = pair[1] 450 if pair[0] == 'epsilon': 451 new_char = '' 452 else: 453 new_char = pair[0] 454 return (new_char,states)

455 456 # Class for Verbose debugging

457 -class Verbose: pass

458

459 -def print_transitions_table(transitions,debug=True):

460 if debug == Verbose: 461 for index in range(len(transitions)): 462 print 'state: %d' % index 463 for key in transitions[index].keys(): 464 print ' %s: %s' % (key,transitions[index][key])

465 466 467 468 469 from fsa_recognizer.sheep_language import * 470 from fsa_recognizer.read_fst_file import * 471 472 if __name__ == '__main__': 473 import os 474 # You can insert a line like the next one (commented out here) to 475 # switch to the directory where you have einsertion.dot, or you can use a full pathname. 476 os.chdir('/Users/jeanmarkgawron/python/compling_intro/fsa_recognizer/fsa_recognizer') 477 comp1 = DotFstCompiler() 478 einsertion_machine = make_fst_from_file('einsertion.dot',comp1) 479 ## lowers shd be the list ['foxes'] 480 # lowers = apply_dn('fox+s#',einsertion_machine) 481 ## uppers shd be a large list of strings that includes 'xes' 482 ## with '#' and '+' scattered through various positions 483 # uppers = apply_up('foxes',einsertion_machine) 484

Source Code for Module fsa_recognizer.fta_rec