-
Notifications
You must be signed in to change notification settings - Fork 3
/
utterances.py
123 lines (102 loc) · 5.48 KB
/
utterances.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#######################################
## SPADE utterance extraction script ##
#######################################
## Processes and extracts start-times and end-times for all speaker utterances.
## Used for extracting data collected as part of the SPeech Across Dialects of English
## (SPADE) project.
## Input:
## - corpus name (e.g., Buckeye, SOTC)
## - corpus metadata (stored in a YAML file), which
## specifies the path to the audio, transcripts and metadata
## Output:
## - CSV of utterance metadata for the corpus
import yaml
import time
from datetime import datetime
import sys
import os
import argparse
base_dir = os.path.dirname(os.path.abspath(__file__))
script_dir = os.path.join(base_dir, 'Common')
sys.path.insert(0, script_dir)
import common
from polyglotdb import CorpusContext
from polyglotdb.utils import ensure_local_database_running
from polyglotdb.config import CorpusConfig
from polyglotdb.io.enrichment import enrich_lexicon_from_csv
def utterance_export(config, corpus_name, corpus_directory, dialect_code, speakers, ignored_speakers=None):
    """Export one CSV row per utterance, with timing and speaker metadata.

    The CSV is written to ``<base_dir>/<corpus_name>/<corpus_name>_utterances.csv``
    and the elapsed time is recorded through ``common.save_performance_benchmark``.

    Args:
        config: Corpus configuration handed to ``CorpusContext``.
        corpus_name: Corpus name; selects the output directory and file name.
        corpus_directory: Unused here; kept so existing callers keep working.
        dialect_code: Unused here; kept so existing callers keep working.
        speakers: Unused here; kept so existing callers keep working.
        ignored_speakers: Speaker names whose utterances are left out of the
            export. ``None`` (the default) is treated as "ignore nobody".
    """
    ## The default of None must not reach the query filter; normalise it to
    ## an empty list so the filter always receives a collection.
    if ignored_speakers is None:
        ignored_speakers = []

    csv_path = os.path.join(base_dir, corpus_name, '{}_utterances.csv'.format(corpus_name))
    with CorpusContext(config) as c:
        print("Beginning utterance export")
        beg = time.time()

        ## Select every utterance whose speaker is not in the ignore list.
        q = c.query_graph(c.utterance).filter(c.utterance.speaker.name.not_in_(ignored_speakers))

        ## Fixed columns: the utterance itself, the following utterance,
        ## speech rate, and the discourse the utterance belongs to.
        q = q.columns(c.utterance.speaker.name.column_name('speaker'),
                      c.utterance.id.column_name('utterance_label'),
                      c.utterance.begin.column_name('utterance_begin'),
                      c.utterance.end.column_name('utterance_end'),
                      c.utterance.following.begin.column_name('following_utterance_begin'),
                      c.utterance.following.end.column_name('following_utterance_end'),
                      c.utterance.speech_rate.column_name('speech_rate'),
                      c.utterance.discourse.name.column_name('discourse'),
                      c.utterance.discourse.speech_begin.column_name('discourse_begin'),
                      c.utterance.discourse.speech_end.column_name('discourse_end'))

        ## Add one column per remaining speaker property listed in the corpus
        ## hierarchy ('name' is already exported above as 'speaker').
        for sp, _ in c.hierarchy.speaker_properties:
            if sp == 'name':
                continue
            q = q.columns(getattr(c.utterance.speaker, sp).column_name(sp))

        ## Write the query to a CSV file
        print("Writing CSV")
        q.to_csv(csv_path)

        ## Measure once so the printed and the benchmarked durations agree.
        time_taken = time.time() - beg
        print('Query took: {}'.format(time_taken))
        print("Results for query written to " + csv_path)
        common.save_performance_benchmark(config, 'utterance_export', time_taken)
## Process command-line arguments (corpus metadata, corpus reset, etc).
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus_name', help='Name of the corpus')
    parser.add_argument('-r', '--reset', help="Reset the corpus", action='store_true')
    parser.add_argument('-d', '--docker', help="This script is being called from Docker", action='store_true')
    args = parser.parse_args()
    corpus_name = args.corpus_name
    reset = args.reset
    docker = args.docker

    ## Corpus directories are the sub-directories of base_dir other than
    ## 'Common'. os.listdir returns bare names, so each one is joined with
    ## base_dir before the isdir test; otherwise the check would be made
    ## relative to the current working directory.
    directories = [x for x in os.listdir(base_dir)
                   if os.path.isdir(os.path.join(base_dir, x)) and x != 'Common']
    if args.corpus_name not in directories:
        print(
            'The corpus {0} does not have a directory (available: {1}). Please make it with a {0}.yaml file inside.'.format(
                args.corpus_name, ', '.join(directories)))
        sys.exit(1)

    corpus_conf = common.load_config(corpus_name)
    print('Processing...')

    # sanity check database access
    common.check_database(corpus_name)

    ## Speakers listed under 'ignore_speakers' in the corpus config are
    ## excluded from the export; default to none.
    ignored_speakers = corpus_conf.get('ignore_speakers', [])

    if reset:
        common.reset(corpus_name)

    ## Use the Docker address when the script was started with --docker.
    ip = common.server_ip
    if docker:
        ip = common.docker_ip

    ## start processing the corpus
    with ensure_local_database_running(corpus_name, port=common.server_port, ip=ip, token=common.load_token()) as params:
        config = CorpusConfig(corpus_name, **params)
        config.formant_source = 'praat'

        # Common set up
        ## Check if the corpus already exists as a database: if not, import the audio and
        ## transcripts and store in graph format
        common.loading(config, corpus_conf['corpus_directory'], corpus_conf['input_format'])

        ## Add information to the corpus regarding lexical, speaker, and linguistic information
        common.lexicon_enrichment(config, corpus_conf['unisyn_spade_directory'], corpus_conf['dialect_code'])
        common.speaker_enrichment(config, corpus_conf['speaker_enrichment_file'])
        common.basic_enrichment(config, corpus_conf['vowel_inventory'] + corpus_conf['extra_syllabic_segments'], corpus_conf['pauses'])

        ## Call the utterance export function, as defined above
        utterance_export(config, corpus_name, corpus_conf['corpus_directory'], corpus_conf['dialect_code'], corpus_conf['speakers'], ignored_speakers=ignored_speakers)
        print('Finishing up!')