-
Notifications
You must be signed in to change notification settings - Fork 3
/
autovot.py
154 lines (134 loc) · 6.44 KB
/
autovot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import sys
import os
import argparse
import sys
import yaml
base_dir = os.path.dirname(os.path.abspath(__file__))
import polyglotdb.io as pgio
from polyglotdb.utils import ensure_local_database_running
from polyglotdb.config import CorpusConfig
from polyglotdb import CorpusContext
from Common import common
def load_config(corpus_name):
    """Load and validate the YAML configuration of a corpus.

    The configuration is read from
    ``<base_dir>/<corpus_name>/<corpus_name>.yaml``.

    Args:
        corpus_name: Name of the corpus; used both as the sub-directory of
            ``base_dir`` and as the stem of the YAML file.

    Returns:
        The parsed configuration mapping. The key ``'vowel_prototypes_path'``
        is always present on return: it is set to ``''`` when it is absent,
        null, or does not point to an existing path.

    Exits:
        Calls ``sys.exit(1)`` after printing a message when the file does not
        exist, when its top level is not a mapping, or when any of the
        expected keys is missing.
    """
    path = os.path.join(base_dir, corpus_name, '{}.yaml'.format(corpus_name))
    if not os.path.exists(path):
        print('The config file for the specified corpus does not exist ({}).'.format(path))
        sys.exit(1)

    expected_keys = [
        'corpus_directory', 'input_format', 'dialect_code', 'unisyn_spade_directory',
        'speaker_enrichment_file',
        'speakers', 'vowel_inventory', 'stressed_vowels', 'sibilant_segments',
    ]

    with open(path, 'r', encoding='utf8') as f:
        # safe_load instead of yaml.load(f): a bare yaml.load() without an
        # explicit Loader is rejected by current PyYAML releases and could
        # construct arbitrary Python objects on older ones.
        conf = yaml.safe_load(f)

    # An empty file parses to None; treat it as an empty mapping so the
    # missing-key report below is produced instead of a TypeError.
    if conf is None:
        conf = {}
    if not isinstance(conf, dict):
        print('The config file {} must contain a mapping of keys to values.'.format(path))
        sys.exit(1)

    missing_keys = [k for k in expected_keys if k not in conf]

    # The prototypes path is optional: fall back to "no prototypes" whenever
    # it is not given (absent or null) or does not exist on disk.
    prototypes_path = conf.get('vowel_prototypes_path')
    if prototypes_path is None:
        conf['vowel_prototypes_path'] = ''
        print('no vowel prototypes path given, so using no prototypes')
    elif not os.path.exists(prototypes_path):
        conf['vowel_prototypes_path'] = ''
        print('vowel prototypes path not valid, so using no prototypes')

    if missing_keys:
        print('The following keys were missing from {}: {}'.format(path, ', '.join(missing_keys)))
        sys.exit(1)
    return conf
def loading(config, corpus_dir, textgrid_format):
    """Import the corpus found at ``corpus_dir`` unless it already exists.

    Args:
        config: Configuration object handed to ``CorpusContext``.
        corpus_dir: Path of the directory holding the corpus files.
        textgrid_format: Name of the input format; selects which
            ``polyglotdb.io`` inspector builds the parser. Anything that is
            not one of the names handled below falls back to the MFA
            inspector.

    Exits:
        Calls ``sys.exit(1)`` after printing a message when ``corpus_dir``
        does not exist.
    """
    with CorpusContext(config) as ctx:
        already_loaded = ctx.exists()
    if already_loaded:
        print('Corpus already loaded, skipping import.')
        return

    if not os.path.exists(corpus_dir):
        print('The path {} does not exist.'.format(corpus_dir))
        sys.exit(1)

    with CorpusContext(config) as ctx:
        print('loading')

        # Resolve the inspector by name so that only the selected attribute
        # of pgio is ever looked up.
        # NOTE(review): "csv" is routed to the Buckeye inspector — confirm
        # this is intended rather than a copy-paste slip.
        if textgrid_format == "buckeye" or textgrid_format == "csv":
            inspector_name = "inspect_buckeye"
        elif textgrid_format.lower() == "fave":
            # Only the FAVE name is matched case-insensitively.
            inspector_name = "inspect_fave"
        else:
            inspector_name = "inspect_mfa"
            for fmt in ("ilg", "labbcat", "partitur", "timit"):
                if textgrid_format == fmt:
                    inspector_name = "inspect_" + fmt
                    break

        corpus_parser = getattr(pgio, inspector_name)(corpus_dir)
        ctx.load(corpus_parser, corpus_dir)
if __name__ == '__main__':
    # Command-line entry point: load (and optionally reset) a corpus, mark
    # the word-initial stops to analyse, optionally (re-)encode VOT with the
    # given classifier, and export the measurements to CSV.
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus_name', help='Name of the corpus')
    parser.add_argument('classifier', help='Path to classifier')
    parser.add_argument('-r', '--reset', help="Reset the corpus", action='store_true')
    parser.add_argument('-e', '--export_file', help='Path of CSV to export')
    parser.add_argument('-v', '--vot', help='Reset and re-encode VOT', action='store_true', default=False)
    args = parser.parse_args()

    corpus_name = args.corpus_name
    classifier = args.classifier
    reset = args.reset
    vot = args.vot

    # Every sub-directory of base_dir except 'Common' counts as a corpus.
    # os.listdir() returns bare names, so each one is joined to base_dir
    # before the isdir test; otherwise the check would be resolved against
    # the current working directory and fail when run from elsewhere.
    directories = [x for x in os.listdir(base_dir)
                   if os.path.isdir(os.path.join(base_dir, x)) and x != 'Common']

    if args.corpus_name not in directories:
        print(
            'The corpus {0} does not have a directory (available: {1}). Please make it with a {0}.yaml file inside.'.format(
                args.corpus_name, ', '.join(directories)))
        sys.exit(1)
    corpus_conf = load_config(corpus_name)
    print('Processing...')

    # Connect to the local database on port 8080.
    with ensure_local_database_running(corpus_name, port=8080, token=common.load_token()) as params:
        # Build the corpus configuration from the connection parameters.
        config = CorpusConfig(corpus_name, **params)
        config.formant_source = 'praat'

        # Common set up
        if reset:
            with CorpusContext(config) as c:
                print("Resetting the corpus.")
                c.reset()
        common.loading(config, corpus_conf['corpus_directory'], corpus_conf['input_format'])
        common.lexicon_enrichment(config, corpus_conf['unisyn_spade_directory'], corpus_conf['dialect_code'])
        common.speaker_enrichment(config, corpus_conf['speaker_enrichment_file'])
        common.basic_enrichment(config, corpus_conf['vowel_inventory'] + corpus_conf['extra_syllabic_segments'], corpus_conf['pauses'])

        with CorpusContext(config) as g:
            # Sets of stops and vowels
            stops = ['p', 't', 'k']
            vowels = corpus_conf['vowel_inventory']

            # If there is already a stop subset in the database, delete it
            # so it can be rebuilt from the current filters.
            if g.hierarchy.has_token_subset('phone', "stops"):
                g.query_graph(g.phone).remove_subset("stops")

            # Encode the subset "stops": phones labelled p/t/k that begin
            # at the start of their word and are followed by a vowel.
            q = g.query_graph(g.phone)
            q = q.filter(g.phone.begin == g.phone.word.begin)
            q = q.filter(g.phone.label.in_(stops))
            q = q.filter(g.phone.following.label.in_(vowels))
            q.create_subset('stops')

            # Ensure utterances are encoded, and encode them if not.
            if 'utterance' not in g.annotation_types:
                g.encode_pauses(corpus_conf["pauses"])
                g.encode_utterances(min_pause_length=0.15)

            # Reset and predict VOT values.
            # NOTE(review): both the reset and the re-analysis are gated on
            # --vot here, matching the flag's help text ("Reset and
            # re-encode VOT") — confirm against the intended behaviour.
            if vot:
                g.reset_vot()
                g.analyze_vot(stop_label='stops',
                              classifier=classifier,
                              vot_min=15,
                              vot_max=250,
                              window_min=-30,
                              window_max=30)

            # Query the columns to export for every phone in the subset,
            # ordered by phone onset.
            q = g.query_graph(g.phone).filter(g.phone.subset == "stops").columns(
                g.phone.label,
                g.phone.begin, g.phone.end, g.phone.vot.confidence,
                g.phone.vot.begin, g.phone.vot.end, g.phone.word.label, g.phone.syllable.stress,
                g.phone.discourse.name, g.phone.speaker.name).order_by(g.phone.begin)

            # Write to the requested file, or to <corpus>/<corpus>_vot.csv
            # under base_dir when no export path was given.
            if args.export_file:
                q.to_csv(args.export_file)
            else:
                q.to_csv(os.path.join(base_dir, corpus_name, '{}_vot.csv'.format(corpus_name)))