1
2
3
4
5 """
6 A finite state transducer (FST) recognizer.
7
8 Our example language relation is transduced sheep language. The upper
9 alphabet will be:
10
11 - {a, b, !}
12
13 The upper language will be sheep language. The lower alphabet will be:
14
15 - {1, 2, 3}
16
17 We will transduce all M{a}s into M{1}s, M{b}s into M{2}s and M{3}s and
18 pass all exclamation points through. So
19 the feasible pairs will be:
20
21 - {a:1, b:2, b:3, !:!}
22
23 We generalize a C{TransitionTable} to FSTs: an FST
24 will be a list of dictionaries of dictionaries.
25
26 For a state S{sigma} that includes the following transitions:
27
28 - S{sigma}[a:1] = (4,5)
29 - S{sigma}[b:2] = (3, )
30
31 the corresponding state-transition function is::
32
33 {'a': {1: (4,5)}
34 'b': {2: (3, )}}
35
36 @var einsertion_machine: transducer for the einsertion rule given in
37 (U{Jurafsky and Martin, Speech and Natural Language Processing, Fig 3.14<http://www-rohan.sdsu.edu/~gawron/compling/chap3/fig03.14.pdf>}). Minor changes
38 spelled out in the dot file in C{einsertion.dot}, from
39 which this transducer is automatically derived.
40 """
41
42
44 """
45 Return True if C{string1} is related to C{string2} by
46 the regular relation defined in the FST given by C{transitions}
47 and C{finals}.
48
49 Uses an agenda just as in the nondet recognition algorithm
50 in L{fsa_recognizer.nondet_fsa_rec.nondet_recognize}. However
51 an agenda search state now consists of a triple, a machine state and
52 two string positions, the position on the upper tape
53 and the position on the lower tape.
54
55 >>> nondet_transduce_recognize('baa!','211!', transduced_sheep_language)
56 (0, 0, 0)[b:2] ==> [(1, 1, 1)]
57 (1, 1, 1)[a:1] ==> [(2, 2, 2)]
58 (2, 2, 2)[a:1] ==> [(3, 3, 3), (2, 3, 3)]
59 Failing search state (2, 3, 3): Backtracking
60 (3, 3, 3)[!:!] ==> [(4, 4, 4)]
61 True
62 >>> nondet_transduce_recognize('baaa!','3111!', transduced_sheep_language)
63 (0, 0, 0)[b:3] ==> [(1, 1, 1)]
64 (1, 1, 1)[a:1] ==> [(2, 2, 2)]
65 (2, 2, 2)[a:1] ==> [(3, 3, 3), (2, 3, 3)]
66 (2, 3, 3)[a:1] ==> [(3, 4, 4), (2, 4, 4)]
67 Failing search state (2, 4, 4): Backtracking
68 (3, 4, 4)[!:!] ==> [(4, 5, 5)]
69 True
70
71 More examples:
72
73 >>> nondet_transduce_recognize('baa','211',transduced_sheep_language)
74 ...
75 False
76 >>> nondet_transduce_recognize('baa!','211!',transduced_sheep_language)
77 ...
78 True
79 >>> nondet_transduce_recognize('baaaa!','21111!',transduced_sheep_language)
80 ...
81 True
82 >>> nondet_transduce_recognize('baaaa!','2111!',transduced_sheep_language)
83 ...
84 False
85
86 The last example fails: Too many a's, not enough 1's.
87
88 >>> nondet_transduce_recognize('baaa!','21111!',transduced_sheep_language)
89 False
90
91 Too many 1's, not enough a's.
92
93 Recognition is initialized with a search consisting of the the starting
94 machine state and the start of tape1 and tape2::
95
96 Initial search state = (starting_machine_state, beginning of tape1,
97 beginning of tape2)
98 = (0,0,0)
99
100 @param string1: upper string potentially a member of upper language
101 @param string2: lower string potentially a member of lower language
102 @param transitions: an FSTTransitionTable
103 @param finals: a tuple of final states.
104 @rtype: Boolean
105 """
106 (transitions,finals) = machine
107 agenda = [(0,0,0)]
108 (last_index1,last_index2) = (len(string1),len(string2))
109 current_search_state = agenda.pop()
110 while True:
111 (machine_state,tape_index1,tape_index2) = current_search_state
112 if transduce_accept_state(current_search_state,last_index1,last_index2,finals):
113 return True
114 else:
115 if tape_index1 < last_index1 and\
116 tape_index2 < last_index2:
117 agenda.extend(transduce_analyze_new_states(current_search_state,
118 transitions,string1,string2))
119 if agenda:
120 current_search_state = agenda.pop()
121 else:
122 return False
123
125 (machine_state,tape_index1,tape_index2) = search_state
126 if tape_index1 == last_index1 and\
127 tape_index2 == last_index2 and\
128 machine_state in finals:
129 return True
130 else:
131 return False
132
133
135 """We assume every input alphabet includes S{epsilon}
136 and therefore that every transition table is defined
137 for S{epsilon}.
138
139 We deal with S{epsilon}-transitions first, if any. Compare
140 L{fsa_recognizer.nondet_fsa_rec.generate_new_states}).
141 """
142 (current_machine_state,current_tape_index1,current_tape_index2) = search_state
143 current_input1 = string1[current_tape_index1]
144 current_input2 = string2[current_tape_index2]
145 next_tape_index1 = current_tape_index1 + 1
146 next_tape_index2 = current_tape_index2 + 1
147
148 epsilon_dict_tape1=transitions[current_machine_state]['epsilon']
149 states1 = [(machine_state,current_tape_index1,next_tape_index2) for machine_state in \
150 epsilon_dict_tape1[current_input2] if not machine_state == 'und']
151 if states1:
152 print '1: %s[eps][%s] ==> %s' % (search_state,current_input2,states1)
153
154 epsilon_dict_tape2=transitions[current_machine_state][current_input1]
155 states2 = [(machine_state,next_tape_index1,current_tape_index2) for machine_state in \
156 epsilon_dict_tape2['epsilon'] if not machine_state == 'und']
157 if states2:
158 print '2: %s[%s][eps] ==> %s' % (search_state,current_input1,states2)
159
160 try:
161 new_dict_tape1=transitions[current_machine_state][current_input1]
162 other_states = [(machine_state,next_tape_index1,next_tape_index2) for machine_state in \
163 new_dict_tape1[current_input2] if not machine_state == 'und']
164 except:
165
166 print ' %s:%s not in input alphabet' % (current_input1,current_input2)
167 other_states = []
168 states1.extend(states2)
169 states1.extend(other_states)
170 if other_states:
171 print ' %s[%s:%s] ==> %s' % (search_state,current_input1,current_input2,other_states)
172 if not states1:
173 print ' Failing search state %s: Backtracking' % (search_state,)
174 return states1
175
176
177
178
179
180
181
182
183 import fsa_recognizer.supertuple
184
185 SearchState = fsa_recognizer.supertuple.superTuplePlus('SearchState',\
186 'state',\
187 'tape_pos',\
188 'related_string')
189
190 -def apply_dn(surface_string,machine, debug=False):
191 """
192 Return the set of strings for lower langauge
193 that are paired by C{machine} with string. Return
194 the empty set if there are none.
195
196 Uses an agenda just as in the nondet recognition algorithm
197 in L{fsa_recognizer.nondet_fsa_rec.nondet_recognize}. However
198 an agenda search state now consists of a triple, a machine state and
199 a surface string position, and the lexical string constructed thus
200 far.
201
202 >>> apply_dn('baa!', transduced_sheep_language)
203 (0, 0, 0)[b:2] ==> [(1, 1, 1)]
204 (1, 1, 1)[a:1] ==> [(2, 2, 2)]
205 (2, 2, 2)[a:1] ==> [(3, 3, 3), (2, 3, 3)]
206 Failing search state (2, 3, 3): Backtracking
207 (3, 3, 3)[!:!] ==> [(4, 4, 4)]
208 {211!}
209
210 More examples:
211
212 >>> apply_dn('baa',transduced_sheep_language)
213 ...
214 {}
215 >>> apply_dn('baa!',transduced_sheep_language)
216 ...
217 {211!}
218 >>> apply_dn('baaaa!',transduced_sheep_language)
219 ...
220 {21111!}
221
222 @param surface_string: string potentially a member of surface language
223 @param transitions: an FSTTransitionTable
224 @param finals: a tuple of final states.
225 @rtype: List of strings
226 """
227 (transitions,finals) = machine
228 transitions = make_generation_transducer(transitions,debug)
229 return find_related_string(transitions,finals,surface_string,debug)
230
231
232 -def apply_up(lex_string,machine, debug=False):
233 """
234 Return the set of strings for surface langauge
235 that are paired by C{machine} with lexical string
236 C{lex_string}. Return the empty list if there are none.
237
238 Uses an agenda just as in the nondet recognition algorithm
239 in L{fsa_recognizer.nondet_fsa_rec.nondet_recognize}. However
240 an agenda search state triple now consists of a machine state,
241 a lex string position, and the surface string constructed thus
242 far.
243
244 >>> apply_up('311!',transduced_sheep_language)
245 0[3:b] ==> 1
246 1[1:a] ==> 2 b
247 2[1:a] ==> 3 ba
248 2[1:a] ==> 2 ba
249 Failing search state SearchState(2, 3, 'baa'): Backtracking
250 3[!:!] ==> 4 baa
251 ['baa!']
252
253 More examples:
254
255 >>> apply_up('112',transduced_sheep_language)
256 ...
257 []
258 >>> apply_up('211!',transduced_sheep_language)
259 ...
260 [baa!]
261 >>> apply_up('21111!',transduced_sheep_language)
262 ...
263 [baaaa!]
264
265 @param lex_string: lex string potentially a member of lexical language
266 @param transitions: an FSTTransitionTable
267 @param finals: a tuple of final states.
268 @rtype: List
269 """
270 (transitions,finals) = machine
271 transitions = make_analysis_transducer(transitions,debug)
272 return find_related_string(transitions,finals,lex_string,debug)
273
274
305
306
308 (state,input_tape_pos,_) = search_state
309 if input_tape_pos == last_index and state in finals:
310 return True
311 else:
312 return False
313
314 epsilon_print_name = 'eps'
315
316 -def add_new_search_states (pairs_list, current_input, new_tape_position,\
317 search_states, state, r_str, print_items):
318 """
319 add new search states to C{search_states} using the (new_char, new_states)
320 pairs in C{pairs_list}, current input C{current_input},
321 and the related string C{r_str} built thus far.
322
323 Also construct a list of items for printing debug info C{print_items}
324 """
325 for (new_char, new_states) in pairs_list:
326 for new_state in new_states:
327 print_items.append((state,current_input,new_char,new_state, r_str))
328 search_states.append(SearchState(state=new_state,\
329 tape_pos=new_tape_position,\
330 related_string=r_str + new_char))
331
333 """
334 Return a list of C{new_search_states} given the C{current_search_state},
335 C{transitions} from the current transducer, and a surface C{string}.
336
337 C{search_state} is a triple of::
338 (machine_state, surface_tape_position, related_string)
339 C{new_search_states} will be a list of same computed by:
340 - trying all permissible epsilon transitions for the machine
341 state in C{search_state}
342 - trying all permissible transitions for the character at
343 C{string}[C{surface_tape_position}]
344 """
345 (state,input_tape_pos,r_str) = search_state
346 try:
347 current_input = string[input_tape_pos]
348 next_tape_pos = input_tape_pos + 1
349 except IndexError:
350
351 current_input = ''
352 next_tape_pos = input_tape_pos
353 new_search_states = [ ]
354 print_items = [ ]
355
356 epsilon_pairs = transitions[state]['epsilon']
357 add_new_search_states(epsilon_pairs,epsilon_print_name,input_tape_pos,new_search_states,state,r_str,print_items)
358
359 if current_input:
360
361 try:
362 tape_advancing_pairs = transitions[state][current_input]
363 except KeyError:
364 print ' %s: not in input alphabet' % (current_input,)
365 tape_advancing_pairs = [ ]
366 else:
367 tape_advancing_pairs = [ ]
368 add_new_search_states(tape_advancing_pairs, current_input, next_tape_pos, new_search_states, state, r_str, print_items)
369
370 if new_search_states:
371 for new_item in print_items:
372 print ' %s[%s:%s] ==> %s %s' % new_item
373 if debug:
374 print ' >%s' % (new_search_states,)
375 if not new_search_states:
376 print ' Failing search state %s: Backtracking' % (search_state,)
377
378 return new_search_states
379
380
382 """
383 An FST is a list of states, each represented by its
384 transducer transition fn (a dicts of dicts):
385 - [TTF1, TTF2, ...]
386 - TTF1 = {lex_char1: D1, lex_char2:D2,... }
387 Each TTFi (Transducer Transition Function) is a function from lexical
388 chars to an FSA Transition function (a function from chars to
389 sets of states).
390
391 Each embedded dictionary, call it Di, thus represents an FSA
392 transition function.
393
394 Transduce such a list of TTFs into a different list of dictionaries
395 - [TTF1', TF2', ...]
396 - TTF1' = {lex_char1: L1, lex_char2: L2, ...}
397 Where each Li is just Di.items() [the list of key-value pairs]
398 with the undefined transitions removed::
399 Li = [pair for pair in Di.items()\
400 if state transition of pair is not undefined]
401 """
402 new_transitions = []
403 for state in transitions:
404 transition_function = {}
405 new_transitions.append(transition_function)
406 for char in state.keys():
407 transition_function[char] = \
408 [handle_eps(pair) for pair in state[char].items()\
409 if pair[1]<>('und',)]
410 print_transitions_table(new_transitions,debug)
411 return new_transitions
412
414 """
415 An FST is a list of states, each represented by its
416 transducer transition fn (a dicts of dicts):
417 - [TTF1, TTF2, ...]
418 - TTF1 = {surf_char1: D1, surf_char2:D2,... }
419 Each TTFi (Transducer Transition Function) is a function from surface
420 chars to an FSA Transition function (a function from chars to
421 sets of states).
422
423 Transduce Each TTF into a dictionary TF1';
424 - [TF1', TF2', ...]
425 - TF' = {lex_char1: L1, lex_char2: L2, ...}
426 Where each lex_chari is associated with the set of surface_chars that
427 map to it in this state and each surface char, surf_charj
428 is pairsed with the set of states that surf_charj:lex_chari map to
429 in this state.
430 """
431 new_transitions = []
432 for state in transitions:
433 transition_function = {}
434 new_transitions.append(transition_function)
435 for surf_char in state.keys():
436 for lex_char in state[surf_char].keys():
437 new_states = state[surf_char][lex_char]
438 this_pair_list = transition_function.setdefault(lex_char,[])
439 if new_states <> ('und',):
440 this_pair_list.append(handle_eps((surf_char,new_states)))
441 print_transitions_table(new_transitions,debug)
442 return new_transitions
443
445 """
446 Output epsilons are just represented as the empty
447 string. Do this transduction here.
448 """
449 states = pair[1]
450 if pair[0] == 'epsilon':
451 new_char = ''
452 else:
453 new_char = pair[0]
454 return (new_char,states)
455
456
458
465
466
467
468
469 from fsa_recognizer.sheep_language import *
470 from fsa_recognizer.read_fst_file import *
471
472 if __name__ == '__main__':
473 import os
474
475
476 os.chdir('/Users/jeanmarkgawron/python/compling_intro/fsa_recognizer/fsa_recognizer')
477 comp1 = DotFstCompiler()
478 einsertion_machine = make_fst_from_file('einsertion.dot',comp1)
479
480
481
482
483
484