-
Notifications
You must be signed in to change notification settings - Fork 3
/
run_all_corpora.py
47 lines (39 loc) · 1.5 KB
/
run_all_corpora.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import os
import argparse
import subprocess
parser = argparse.ArgumentParser()
parser.add_argument("corpusdir", help = "Path to the directory containing corpus directories")
parser.add_argument("script", help = "name of the script to be run")
args = parser.parse_args()
## lists of corpora to skip
## and failed to run
skipped = ['spade-Penn-Neighborhood']
failed = []
## first check that the script exists
assert(os.path.isfile(args.script), "{} should be a script that exists".format(args.script))
## get the corpora from the directory
corpora = [f for f in sorted(os.listdir(args.corpusdir)) if f.startswith("spade-")]
## loop through corpus files
for corpus in corpora:
## check if the file is actually a directory since that is the expected format for the
## analysis scripts
if os.path.isdir(corpus):
if corpus in skipped:
print("Skipping {}".format(corpus))
continue
try:
print("Processing {}".format(corpus))
## first reset the corpus
subprocess.call(['python', 'reset_database.py', corpus])
## run the script on the corpus
subprocess.call(['python', args.script, corpus, "-s"])
## reset corpus afterwards to save memory
try:
subprocess.call(['python', 'reset_database.py', corpus])
except:
continue
except:
failed.append(corpus)
continue
print("Complete!")
print("Following corpora were not run: {}", failed)