1 """
2 This module contains code for turning different
3 file representations of FSTs into Python FST reps:
4 a list of dictionaries (with upper-lg keys) of dictionaries
5 (with lower lgs keys). These can then be
6 used by the transducer generation and analysis
7 code in L{fsa_recognizer.fta_rec}.
8
9 The toplevel function is L{read_fst_file.make_fst_from_file},
10 which takes a compiler instance (particular
11 to the format of the file being read) and
12 a filename argument, returning an FST.
13
14 Implemented formats include ATT graphviz ("dot") files
15 and xerox xfst ".net" files.
16 """
17
18 import re
19
20
21
22
23
24
25
27 """
28 Open source file C{file}, call the appropriate
29 compiler C{CompilerInst}, and turn it into a Python dictionary based
30 represntation of an FST, suitable for the code in
31 in L{fsa_recognizer.fta_rec}.
32
33 Assumptions for the dot file:
34 - node names are string reps of integers
35 - The node numbered 0 is the start node
36 - epsilon rep is as defined in upper/lower
37 char_map.
38
39 Assumptions for FST:
40 - upper language comes FIRST in state-transition fns (STFs):
41 That is, an STF is a function from upper-language chars
42 to functions from lower lang chars to state-sets
43 (FSA transition functions on the lower language).
44
45 @param file: string (filename path)
46 @rtype fst (pair of dictionary and tuple)
47 """
48 (_, raw_dictionary) = Compiler.read_file(file)
49 feasible_pairs = get_feasible_pairs(raw_dictionary, Compiler)
50 return make_fst(feasible_pairs, raw_dictionary, Compiler)
51
52 -def make_fst(feasible_pairs,raw_dictionary, Compiler):
53 """
54 We assume states are string reps of ints and
55 the state numbered 0 is always initial state.
56
57 We create an FST whose state dictionaries are
58 all totally defined for the set of feasiable
59 pairs, filling in the gaps in raw_dictionary with
60 transitions to ('und',).
61
62 We sort the dictionary items according to the individual
63 compiler's cmp fn, assuming that the start state will
64 get ranked first.
65
66 We assume every state fn in raw_dic is a dictionary.
67 Two keys are special:
68
69 - 'final' takes a boolean value specifying whether this
70 is a final state.
71 - 'start' takes a boolean value specifying whether this
72 is a start state. (not used)
73
74 All other keys are upper lg chars whose values are
75 dictionaries whose keys are lower lg chars::
76
77 raw_dic[state][upper][lower] = the set of states transitioned to
78 in teh state C{state} when upper
79 char C{char} is paired with lower lg
80 char C{lower}
81 """
82 raw_items = raw_dictionary.items()
83 raw_items.sort(Compiler.cmp_item)
84 transitions = []
85 finals = []
86 for item in raw_items:
87 state_dic = {}
88 transitions.append(state_dic)
89 if item[1]['final']:
90 finals.append(item[0])
91 for pair in feasible_pairs:
92 upper_char_dic = state_dic.setdefault(Compiler.upper_char_map[pair[0]],{})
93 upper_char_raw_dic = item[1].get(pair[0],{})
94 new_states = upper_char_raw_dic.get(pair[1], ('und',))
95 upper_char_dic[Compiler.lower_char_map[pair[1]]] = new_states
96 return (transitions, tuple(finals))
97
98
100 """
101 Assumptions for FST:
102 - The set of feasible pairs includes some initial
103 set passed in as C{initial_freasible_pairs}. Usually
104 this is the identity mapping for all lower case
105 alphabetic characters.
106 - All other possible mappings are instantiated
107 in the raw dictionary as pairings of chars
108 in the upper and lower alphabet.
109 Search the raw_dic rep of the FST,
110 adding any new pairs found to the initial mapping.
111
112 @param raw_dic: a dictionary
113 @rtype: a set (of pairs of chars)
114 """
115 res = Compiler.feasible_pairs
116 for transition_func in raw_dic.values():
117 for upper_char in transition_func.keys():
118
119 if upper_char not in ['start','final']:
120
121 add_to_char_map_if_necessary(Compiler.upper_char_map,\
122 upper_char)
123 for lower_char in transition_func[upper_char].keys():
124 add_to_char_map_if_necessary(Compiler.lower_char_map,\
125 lower_char)
126 res.add((upper_char,lower_char))
127
128 for item in Compiler.upper_char_map.items():
129 if item[1] == 'epsilon':
130 upper_zero = item[0]
131
132 for item in Compiler.lower_char_map.items():
133 if item[1] == 'epsilon':
134 lower_zero = item[0]
135
136
137 res.add((upper_zero,lower_zero))
138 return res
139
141 try:
142 char_map[char]
143 except KeyError:
144 char_map[char] = char
145
147 """
148 Expand a char interval rep like a..z
149 into a list of characters.
150
151 @param low: character the lower bound character in the interval
152 @param high: character the upper bound character in the interval
153 @rtype: list of characters
154 """
155
156 code_range = range( ord(low),ord(high)+1)
157 char_list = []
158 for code in code_range:
159 char_list.append(chr(code))
160 return char_list
161
163 line = line.strip()
164 if re.match(blank_line_re, line):
165 return True
166 else:
167 return False
168
169
170
171
172
173
174 alphabet = 'abcdefghijklmnopqrstuvwxyz'
175
176
177
178
179
180
181
183 """
184 Class for compiling FST representations in various file
185 formats into equivalent raw Python dictionaries such that::
186
187 D[state][upper_char][lower_char] = set of states
188
189 The intent of this 'raw' dictionary representation is to be a
190 modular portable representation of the FST transitions and
191 final and initial state. Alphabet and feasible_pair info is
192 stored on the compiler instance.
193
194 The raw dictionary is not suitable for the code in L{fsa_recognizer.fta_rec}
195 because it does not define a total functions on the alphabet.
196 To produce dictionaries of that form call L{fsa_recognizer.fta_rec.make_fst_from_file}, which calls an instance of this class for initial
197 file processing and then fills out the partial transition functions.
198
199 Default assumptions for FST:
200 - The set of feasible pairs includes the identity
201 mapping for all lower case alphabetic characters
202 - characters in the files are mapped to themselves
203 in the Python FST (upper/lower)_char_map
204 - default mapping to be extended case by case.
205 """
206
207
208
209
210
211
212
217
219 """
220 Abstract interface for generic read file fn.
221
222 Open FST file C{file} and return C{raw_dic}, a
223 dictionary representation of the FST represented
224 there.
225
226 The keys of the dictionary are state names,
227 possibly renamed from the original representation.
228 C{raw_dic}[state_name] is a transition function for
229 that state represented as a dictionary.
230
231 Two keys are special in transition functions:
232
233 - 'final' takes a boolean value specifying whether this
234 is a final state.
235 - 'start' takes a boolean value specifying whether this
236 is a start state. (not used)
237
238 All other keys in transition fns are upper lg chars whose values are
239 dictionaries whose keys are lower lg chars::
240
241 raw_dic[state][upper][lower] = the set of states transitioned to
242 in the state C{state} when upper
243 char C{char} is paired with lower lg
244 char C{lower}
245
246
247 Implementations of this method should always call
248 initialize reader first, to zero out all the
249 file-specific attributes of C{self}.
250
251 @param file: file containing some representation of an FST.
252 @rtype: dictionary of dictionary of dictionaries
253 """
254 print 'Abstract method to be implemented by subclasses'
255
257 """
258 Zero out all file specific attributes before
259 compiling a new file.
260
261 Called by all implementations of C{read_file}.
262 """
263 print 'Abstract method to be implemented by subclasses'
264
265
266
267
268
269 graph_type_re = re.compile(r'digraph')
270 blank_line_re = re.compile(r'\s+$')
271 node_dec_re = re.compile(r'\s*node\s*\[(.*)\]\s*;')
272 node_name_re = re.compile(r'\s*(\w+)\s*\[(.*)\]\s*;')
273 node_connection_re = re.compile(r'\s*(\w+)\s*->\s*(\w+)\s*\[(.*)\]\s*;')
274
275 label_re = re.compile(r'[ ,\{\}]')
276 char_interval_re = re.compile(r'(.)\.\.(.)')
277
278 att_val_re = re.compile(r'\s*\,?(\w+)\s*=\s*(\d+)')
279
280
281
282 att_val_re2 = re.compile(r'\s*\,?(\w+)\s*=\s*"(.+?)"')
283
284
285
286
287
289 """
290 Code for compiling an ATT graphviz ("dot")
291 representation of an FST into a Python FST rep.
292 """
293
295 FstCompiler.__init__(self)
296
297 self.upper_char_map.update({'[]':'epsilon', '(+)':'+', '(#)':'#'})
298 self.lower_char_map.update({'[]':'epsilon'})
299
301 """
302 This function zeroes out all the records kept
303 during the reading of a fst file so that the same
304 compiler can be used on a new file safely.
305 Shd be called by read_file.
306 """
307 self.alphabet = set([])
308 self.node_dic = {}
309 self.gtype = ''
310
311
312
314 """
315 Open dot file C{file} and return C{raw_dic}, a
316 dictionary representation of the FST represented
317 there.
318
319 We retain the state names in the dot file, but
320 we assume they are string reps of integers, an
321 assumption not b uilt into dot-files.
322
323 See interface documentation for Class FstCompiler
324 method read_file. This method respects that interface.
325
326 @param file: file containing dot representation of an FST.
327 @rtype: dictionary of dictionary of dictionaries
328 """
329 self.initialize_reader()
330 dot_file = open(file,'r')
331 lines = dot_file.readlines()
332 dot_file.close()
333 self.gtype = self.find_graph_type(lines)
334 self.node_dic = {}
335
336 for line in lines:
337 node_dec_match = re.match(node_dec_re,line)
338 node_name_match = re.match(node_name_re,line)
339 node_connection_match = re.match(node_connection_re,line)
340 if node_dec_match:
341 pass
342 elif node_name_match:
343 node_name = node_name_match.groups()[0]
344 node_atts = node_name_match.groups()[1]
345 att_dic = self.process_atts(node_atts)
346 self.classify_node(node_name,att_dic, self.node_dic)
347 elif node_connection_match:
348 src_node_name = node_connection_match.groups()[0]
349 tgt_node_name = node_connection_match.groups()[1]
350 node_atts = node_connection_match.groups()[2]
351 att_dic = self.process_atts(node_atts)
352 self.classify_link(src_node_name,tgt_node_name,\
353 att_dic,self.node_dic)
354 return (self.gtype, self.node_dic)
355
357 """
358 Items are pairs of state names and transition fns.
359
360 We assume state names are string reps of ints and
361 that the starting state has been named '0'.
362 """
363 return cmp(int(item1[0]),int(item2[0]))
364
366 """
367 Determine whether node_name is a start state
368 and whether it is a final state. Update C{node_dic}
369 with this info.
370
371 Assume node names are string reps of integers,
372 but map them to int keys in C{node_dic}
373
374 @param node_name: string
375 @param att_dic: dictionary (of attribute value pairs)
376 @param node_dic: dictionary (the FST we are updating)
377 @rtype: None
378 """
379 char_dic = node_dic.setdefault(int(node_name),{})
380 char_dic['start'] = self.is_start_node(att_dic)
381 char_dic['final'] = self.is_final_node(att_dic)
382
383 - def classify_link(self,src_node_name, tgt_node_name, att_dic, node_dic):
384 """
385 Turn the label in a dot file string rep into
386 a series of transitions in the Python FST we are building.
387
388 C{src_node_name} is the name of the state whose transition
389 function we will be updating.
390
391 Assume node names are string reps of integers,
392 but map them to int keys in C{node-dic}
393
394 @param src_node_name: string
395 @param tgt_node_name: string
396 @param att_dic: dic represention of att value pairs attached to
397 the transition from src to tgt.
398 @param node_dic: the dictionary representation of the
399 FST to be updated
400 """
401 label_string = att_dic['label']
402 label_set0 = label_re.split(label_string)
403 label_set = filter(lambda y: y <> '', label_set0)
404 src_node_dic = node_dic.setdefault(int(src_node_name),{})
405 for l in label_set:
406 char_interval_match = re.match(char_interval_re, l)
407 try:
408 (upper_char,lower_char) = l.split(':')
409 except ValueError:
410 upper_char = lower_char = l
411 if char_interval_match:
412 for char in expand_char_interval(char_interval_match.groups()[0],\
413 char_interval_match.groups()[1]):
414 upper_char_dic = src_node_dic.setdefault(char,{})
415 target_nodes = upper_char_dic.setdefault(char,())
416 upper_char_dic[char]=target_nodes+(int(tgt_node_name),)
417 else:
418 upper_char_dic = src_node_dic.setdefault(upper_char,{})
419 target_nodes = upper_char_dic.setdefault(lower_char,())
420 upper_char_dic[lower_char]=target_nodes+(int(tgt_node_name),)
421
422
424 green = False
425 try:
426 if att_dic['fillcolor'] == 'green':
427 green = True
428 elif att_dic['fillcolor'] == 'purple':
429 green = True
430 except:
431 green = False
432 try:
433 if att_dic['comment'] == 'start':
434 green = True
435 except:
436 pass
437 return green
438
440 red = False
441 try:
442 if att_dic['fillcolor'] == 'red':
443 red = True
444 elif att_dic['fillcolor'] == 'purple':
445 red = True
446 except:
447 red = False
448 return red
449
451 """
452 Turn a string rep of node or transition attributes from
453 a dot file into a dictionary with keys = to attributes.
454
455 @param att_string: string
456 @rtype: dictionary
457 """
458 equations = att_string
459 att_val_dic = {}
460 while equations:
461 att_val_match = re.match(att_val_re,equations)
462 if not att_val_match:
463 att_val_match = re.match(att_val_re2,equations)
464 if att_val_match:
465 att_val_dic[att_val_match.groups()[0]] =\
466 att_val_match.groups()[1]
467 equations = equations[att_val_match.end():]
468 else:
469 equations = ''
470 return att_val_dic
471
473 while lines:
474 try:
475 graph_type = lines[0]
476 except IndexError:
477 print 'No graph found!'
478 match_obj = re.match(graph_type_re, graph_type)
479 if match_obj:
480 del lines[0]
481 return match_obj.group()
482 elif blank_line(lines[0]):
483 del lines[0]
484 else:
485 print 'Unknown graph type!'
486 return None
487
488
489
490
491
492
493
494 flags_re = re.compile(r'Flags:(.*)$')
495 net_dec_re = re.compile(r'Net:')
496 sigma_dec_re = re.compile(r'Sigma:(.*)$')
497 size_dec_re = re.compile(r'Size:\s*(\d+)\,')
498 arity_dec_re = re.compile(r'Arity:\s*(\d+)')
499 state_desc_re = re.compile(r'(.?s\d+):(:?\s*(.+)\s*->\s*(.+)\.|\s*\(no arcs\))')
500
501
502
503
504
506 """
507 Code for compiling an Xerox xfst
508 representation of an FST into a Python FST rep.
509 """
510
512 FstCompiler.__init__(self)
513
514 self.upper_char_map.update({'0':'epsilon'})
515 self.lower_char_map.update({'0':'epsilon'})
516
517 self.transition_pairs_dic = {}
518
520 """
521 This function zeroes out all the records kept
522 during the reading of a fst file so that the same
523 compiler can be used on a new file safely.
524 Shd be called by read_file.
525 """
526 self.alphabet = set([])
527 self.node_dic = {}
528 self.gtype = ''
529 self.transition_pairs_dic = {}
530 self.name_mapping = {}
531
532
533
535 """
536 Open xerox ".net" file C{file} and return C{raw_dic}, a
537 dictionary representation of the FST represented
538 there.
539
540 We retain the state names in the net file,
541 which determine which is a start state ('s0')
542 and which are final states (state names beginning
543 with 'fs'.
544
545 See interface documentation for Class FstCompiler
546 method read_file. This method respects that interface.
547
548 @param file: file containing xfst representation of an FST.
549 @rtype: dictionary of dictionary of dictionaries
550 """
551 self.initialize_reader()
552 net_file = open(file,'r')
553 lines = net_file.readlines()
554 net_file.close()
555 self.gtype = 'digraph'
556
557 print self.node_dic
558 print self.transition_pairs_dic
559 for line in lines:
560 print line
561 for line in lines:
562 flags_match = re.match(flags_re,line)
563 net_dec_match = re.match(net_dec_re,line)
564 sigma_dec_match = re.match(sigma_dec_re,line)
565 size_dec_match = re.match(size_dec_re,line)
566 arity_dec_match = re.match(arity_dec_re,line)
567 state_desc_match = re.match(state_desc_re,line)
568 if flags_match or net_dec_match:
569 pass
570 elif sigma_dec_match:
571 self.alphabet = set(sigma_dec_match.groups()[0].split())
572 self.alphabet = set(filter(lambda y: y <> '', self.alphabet))
573
574 if '?' in self.alphabet:
575 self.alphabet.remove('?')
576
577 self.other = set(alphabet) - self.alphabet
578
579 self.alphabet.update(alphabet)
580 elif size_dec_match:
581 self.size = int(size_dec_match.groups()[0].strip())
582 elif arity_dec_match:
583 self.arity = int(arity_dec_match.groups()[0].strip())
584 elif state_desc_match:
585 state = state_desc_match.groups()[0]
586 (StartState,FinalState,NewName) = self.process_state(state)
587 transition_function = self.node_dic.setdefault(NewName,{})
588 transition_function['start'] = StartState
589 transition_function['final'] = FinalState
590 self.name_mapping[state] = NewName
591 transition_set = self.transition_pairs_dic.setdefault(NewName,set([]))
592 transitions = state_desc_match.groups()[1]
593 transition_set.update(self.process_transitions(state,transitions))
594
595
596
597
598 self.classify_links(self.node_dic)
599 return (self.gtype, self.node_dic)
600
602 final = False
603 start = False
604 if state.startswith('fs'):
605 final = True
606 newstring = state[2:]
607 if state.startswith('s'):
608 newstring = state[1:]
609 if int(newstring) == 0:
610 start = True
611 return (start, final, int(newstring))
612
614 return cmp(item1[0], item2[0])
615
617 """
618 C{transitions} a string of comma separated state transitions
619 of the form::
620
621 word -> state
622
623 @rtype: a set of pairs of word and state.
624 """
625 res = set([])
626 for transition in transitions.split(','):
627 pair = transition.split('->')
628 if len(pair) == 2:
629 res.add((pair[0].strip('. '),pair[1].strip('. ')))
630 elif re.search(r'no arcs', transition):
631 pass
632 else:
633 print 'Unknown transition for %s: %s' % (state,transition)
634 return res
635
637 """
638 Add info to node dic for state and self.transition_pairs[state]
639
640 This is called after all state declarations have been read in
641 and a name mapping assigned in self.name_mapping.
642
643 @param node_dic: the dictionary representation of the
644 FST to be updated
645 """
646 print node_dic
647 print node_dic.keys()
648 print self.transition_pairs_dic
649 for state in node_dic.keys():
650 src_state_dic = node_dic[state]
651 for pair in self.transition_pairs_dic[state]:
652 if pair[0] == '?':
653 expanded_pairs = []
654 for c in self.other:
655 expanded_pairs.append((c,pair[1]))
656 else:
657 expanded_pairs = [pair]
658 for p in expanded_pairs:
659 try:
660 (upper_char,lower_char) = p[0].split(':')
661 except ValueError:
662 upper_char = lower_char = p[0]
663 upper_char_dic = src_state_dic.setdefault(upper_char,{})
664 target_nodes = upper_char_dic.setdefault(lower_char,())
665 upper_char_dic[lower_char]=\
666 target_nodes+(int(self.name_mapping[p[1]]),)
667