forked from oshindow/Transformer-Transducer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_char_vocab.py
47 lines (37 loc) · 1.26 KB
/
get_char_vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import codecs
from rnnt.utils import init_logger
import logging
# Configure the root logger so INFO-level (and more severe) messages are emitted.
logging.basicConfig(level=logging.INFO)
def get_contents(path='/home/oshindo/rnn-transducer/char.txt'):
    """Return every transcript line re-spaced so each character is its own token.

    Each input line is treated as ``<utt_id> <field> <field> ...`` with single
    spaces as separators. The fields after the utterance id are concatenated
    and then re-joined character by character with single spaces.

    Args:
        path: UTF-8 text file to read. Defaults to the location that was
            previously hard-coded, so calls without arguments behave as before.

    Returns:
        A list with one ``"<utt_id> <c1> <c2> ..."`` string per input line,
        in file order.
    """
    contents = []
    with codecs.open(path, 'r', encoding='utf-8') as fid:
        for line in fid:
            parts = line.strip().split(' ')
            utt_id = parts[0]
            # Glue the fields together first, then let " ".join iterate over
            # the resulting string so a space lands between every character.
            text = " ".join("".join(parts[1:]))
            contents.append(utt_id + " " + text)
    return contents
#with open('/home/oshindo/kaldi/egs/thchs30/s5/data/mfcc/train/chartext.txt', 'w') as fid:
# for line in get_contents():
# fid.write(str(line)+'\n')
def get_char(path='/home/oshindo/rnn-transducer/char.txt'):
    """Build a vocabulary table of the distinct tokens found in a transcript file.

    Each input line is treated as ``<utt_id> <token> <token> ...`` with spaces
    as separators. The first field is ignored; every other token is assigned
    an integer index in order of first appearance, starting at 0.

    Args:
        path: UTF-8 text file to read. Defaults to the location that was
            previously hard-coded, so calls without arguments behave as before.

    Returns:
        A list of ``"<token> <index>"`` strings, one per distinct token, in
        order of first appearance.
    """
    char = []
    # A set gives constant-time membership tests; the ordered output lives in
    # ``char`` so first-appearance order is still preserved.
    seen = set()
    with codecs.open(path, 'r', encoding='utf-8') as fid:
        for line in fid:
            parts = line.strip().split(' ')
            for token in parts[1:]:
                # Repeated spaces make split(' ') yield empty strings; skip
                # them so the table never contains an entry with no token.
                if not token or token in seen:
                    continue
                seen.add(token)
                # One entry exists per token already indexed, so the current
                # length is exactly the next unused index.
                char.append(token + " " + str(len(char)))
    return char
# Write the vocabulary table to disk, one entry per line.
# The encoding is spelled out because the entries are read from a UTF-8 file
# and may be non-ASCII; relying on the platform default could fail or produce
# a file that cannot be read back as UTF-8.
with open('thchs30_label/thchs30_char_table.txt', 'w', encoding='utf-8') as fid:
    for line in get_char():
        fid.write(line + '\n')