viterbi.read_probs

#!/usr/bin/python

def read_probs_file(filename, debug=False):
    """Parse a probability file and dump the result as a Python module.

    Two input layouts are understood.

    Case (a), a flat two-parameter probability table::

        p1 p2 prob

    Case (b), a pronunciation dictionary made of per-word sections::

        word:
        phon1 phon2 prob
        ...
        phoni phonj prob

    Lines whose first character is ``%`` are comments and are skipped.
    Every other line is right-stripped and handed to a ``DictDispatcher``.

    In case (a) the two-parameter dictionary is filled::

        D[w1][w2] = probability of w2 given w1

    In case (b) the three-parameter dictionary is filled::

        D[w][phon1][phon2] = probability of phon2 given phon1 when
                             pronouncing word w

    The result is also written next to the input file, with the input's
    extension replaced by ``.py``, as a single assignment named
    ``<input base name>_dict``.  The two-parameter dictionary is written
    when it is non-empty, otherwise the pronunciation dictionary is.

    :param filename: path of the text file to read.
    :param debug: forwarded to ``DictDispatcher.add_line_to_dict``.
    :return: the tuple ``(bi_dict, pron_dict, dispatcher)``.
    """
    # Imported locally so this function does not depend on some other part
    # of the module having bound ``os`` as a global.
    import os

    bi_dict = {}
    pron_dict = {}
    # Bigram lines land in bi_dict until a "word:" header redirects them
    # into that word's entry of pron_dict.
    dispatcher = DictDispatcher(bi_dict)

    # Context managers close the files even if a line fails to parse.
    with open(filename, 'r') as ifh:
        for line in ifh:
            if line.startswith('%'):  # comment line
                continue
            dispatcher.add_line_to_dict(line.rstrip(), pron_dict, debug)

    base = os.path.splitext(filename)[0]
    # Name the generated variable after the file only: with a directory
    # part ("data/foo") the assignment would not be valid Python.
    dict_name = os.path.basename(base) + '_dict'
    table = bi_dict if bi_dict else pron_dict
    with open(base + '.py', 'w') as ofh:
        ofh.write('# Automatically generated from %s\n' % filename)
        ofh.write('%s= %s\n' % (dict_name, table))
    return (bi_dict, pron_dict, dispatcher)

import re

# Header line "word:" that opens one word's section of a pronunciation
# dictionary; group 1 is the word.
word_re = re.compile(r'(\w+):$')
# Data line "p1 p2 prob".  p1 may be the literal '#'.  The probability
# must contain a decimal point with at most one digit before it
# (".5", "0.25").
bigram_re = re.compile(r'(\#|\w+)\s+(\w+)\s+(\d?\.\d+)$')
# Blank line: nothing but optional whitespace.  (Previously r'\w*$', which
# matched a line consisting of a single word instead, so such lines were
# silently ignored rather than reported.)
blank_re = re.compile(r'\s*$')
# The '#' symbol in the first column of a data line.
start_re = re.compile(r'(\#)')

class DictDispatcher(object):
    """Route parsed probability lines into the currently active dictionary.

    ``active_dictionary`` is the dictionary that receives bigram entries.
    It starts as the dictionary given to the constructor and is switched
    to a per-word sub-dictionary each time a ``word:`` header line is seen.
    """

    def __init__(self, active_dictionary=None):
        """Create a dispatcher that initially fills ``active_dictionary``.

        :param active_dictionary: dictionary to fill in place; a fresh
            empty dictionary is created when omitted.
        """
        super(DictDispatcher, self).__init__()
        # Test against None, not truthiness: callers pass an *empty* dict
        # and expect that very object to be filled in place.  A literal
        # ``{}`` default would be shared by every default-built instance.
        if active_dictionary is None:
            active_dictionary = {}
        self.active_dictionary = active_dictionary

    def add_line_to_dict(self, line, pron_dict, debug):
        """Classify one input line and store what it describes.

        * A line matching ``word_re`` makes ``pron_dict[word]`` (created if
          missing) the active dictionary.
        * A line matching ``bigram_re`` stores
          ``active_dictionary[w1][w2] = float(prob)``; a first symbol
          matching ``start_re`` is stored under the key ``'start'``.
        * Any other line that does not match ``blank_re`` is reported on
          standard output as unprocessed.

        :param line: one input line, expected without trailing whitespace.
        :param pron_dict: dictionary holding the per-word sub-dictionaries.
        :param debug: when true, echo the line and each parsed probability.
        """
        if debug:
            print(line)

        word_match = word_re.match(line)
        if word_match:
            # Word header: later bigram lines belong to this word's table.
            self.active_dictionary = pron_dict.setdefault(
                word_match.group(1), {})
            return

        bigram_match = bigram_re.match(line)
        if bigram_match:
            w1, w2, prob = bigram_match.groups()
            if start_re.match(w1):
                w1 = 'start'
            if debug:
                # Diagnostic echo of the probability; only shown on request.
                print(prob)
            self.active_dictionary.setdefault(w1, {})[w2] = float(prob)
            return

        if not blank_re.match(line):
            print('Unprocessed line %s' % line)

if __name__ == '__main__':
    # Script entry point: convert every file named on the command line,
    # with debug output enabled.
    # NOTE(review): nothing in this block uses ``os`` directly; it is
    # presumably bound here for code elsewhere in the module that reads it
    # as a global, so the import is kept.
    import os
    import sys

    for file_str in sys.argv[1:]:
        # Single-argument print calls behave the same on Python 2 and 3.
        print('Processing %s' % file_str)
        print('')
        read_probs_file(file_str, True)

Source Code for Module viterbi.read_probs