-
Notifications
You must be signed in to change notification settings - Fork 0
/
use_heideltime.py
126 lines (93 loc) · 3.46 KB
/
use_heideltime.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
@author: Ulrike Henny-Krahmer
@filename: use_heideltime.py
Submodule calling HeidelTime Standalone
See https://github.com/HeidelTime for more information about this temporal tagger
"""
import os
import glob
import subprocess
import sys
from lxml import etree
import re
def apply_ht(hdpath, infolder, outfolder, language="ENGLISH", outtype="TIMEML", pos="TREETAGGER"):
    """
    Applies the HeidelTime Standalone version to a bunch of plain text files.

    Requires Java and HeidelTime Standalone to be installed.
    For every "*.txt" file in the input folder, HeidelTime is run once and
    whatever it writes to standard output is saved as "<name>.xml" in the
    output folder.
    Be careful: seems to need a lot of time.

    Arguments:
    hdpath (string): path to the HeidelTime installation (the folder holding
        "de.unihd.dbs.heideltime.standalone.jar" and "config.props")
    infolder (string): path to the input folder (which should contain plain text files)
    outfolder (string): path to the output folder (created if it does not exist)
    language (string): indicates the language of the documents, e. g. "SPANISH", "FRENCH", "ENGLISH", defaults to English
    outtype (string): type of result; "XMI" or "TIMEML", defaults to "TIMEML"
    pos (string): POS Tagger; "STANFORDPOSTAGGER", "TREETAGGER" or "NO", defaults to "TREETAGGER".
        NOTE: accepted for interface compatibility, but currently not passed on
        to the HeidelTime command line.

    Raises:
    OSError: if the "java" executable cannot be started.
    """
    inpath = os.path.join(infolder, "*.txt")
    jarpath = os.path.join(hdpath, "de.unihd.dbs.heideltime.standalone.jar")
    configpath = os.path.join(hdpath, "config.props")
    filecounter = 0

    # make sure the output folder exists
    os.makedirs(outfolder, exist_ok=True)

    print("Starting...")
    for filepath in glob.glob(inpath):
        filecounter += 1
        fn = os.path.splitext(os.path.basename(filepath))[0]
        outpath = os.path.join(outfolder, fn + ".xml")
        # Pass the arguments as a list (no shell involved), so that paths
        # containing spaces or shell metacharacters are handled correctly.
        command = [
            "java", "-jar", jarpath,
            filepath,
            "-c", configpath,
            "-l", language,
            "-o", outtype,
        ]
        print("Treating " + fn + " ...")
        # HeidelTime prints its result to stdout; write the raw bytes to the
        # output file, as the former shell redirection did.
        with open(outpath, "wb") as outfile:
            returncode = subprocess.call(command, stdout=outfile)
        if returncode != 0:
            # keep going with the remaining files, but do not hide the failure
            print("Warning: HeidelTime exited with status " + str(returncode)
                  + " for " + fn, file=sys.stderr)
    print("Done. " + str(filecounter) + " files treated.")
def wrap_body(infolder, outfolder):
    """
    Create a "wrapper" element to wrap the HeidelTime annotation results for each file in the collection.

    Every "*.xml" file in the input folder is parsed, its root element is
    placed inside a new <wrapper> element, and the pretty-printed result is
    written as UTF-8 to a file of the same name in the output folder.

    Arguments:
    infolder (string): path to the annotated files
    outfolder (string): path to the output folder (created if it does not exist)
    """
    inpath = os.path.join(infolder, "*.xml")
    filecounter = 0

    # make sure the output folder exists
    os.makedirs(outfolder, exist_ok=True)

    print("Starting...")
    for filepath in glob.glob(inpath):
        filecounter += 1
        fn = os.path.basename(filepath)
        print(fn)

        doc = etree.parse(filepath)
        wrapper = etree.Element("wrapper")
        # the parsed root becomes the only child of the new wrapper element
        wrapper.append(doc.getroot())
        # encoding="unicode" makes tostring return a str without XML declaration
        result = etree.tostring(wrapper, pretty_print=True, encoding="unicode")

        # save the results; the output carries no XML declaration, so it has
        # to be UTF-8 regardless of the platform's default encoding
        with open(os.path.join(outfolder, fn), "w", encoding="utf-8") as output:
            output.write(result)
    print("Done. " + str(filecounter) + " files treated.")
def debug_ampersands(infolder, outfolder):
    """
    Debug ampersands in HeidelTime-Output by replacing & with &amp;

    Every "*.xml" file in the input folder is read as UTF-8, bare ampersands
    are escaped, and the result is written as UTF-8 to a file of the same name
    in the output folder. Ampersands that already start a character or entity
    reference (e. g. "&amp;", "&#38;", "&#x26;") are left untouched, so the
    function can be applied repeatedly without double-escaping.

    Arguments:
    infolder (string): path to the annotated files
    outfolder (string): path to the output folder (created if it does not exist)
    """
    inpath = os.path.join(infolder, "*.xml")
    filecounter = 0
    # an "&" that is NOT followed by "name;", "#digits;" or "#xhex;"
    bare_ampersand = re.compile(r"&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#[xX][0-9A-Fa-f]+);)")

    # make sure the output folder exists
    os.makedirs(outfolder, exist_ok=True)

    print("Starting...")
    for filepath in glob.glob(inpath):
        filecounter += 1
        fn = os.path.basename(filepath)
        print(fn)

        with open(filepath, "r", encoding="UTF-8") as infile:
            text = infile.read()
        result = bare_ampersand.sub("&amp;", text)

        # save the results, using the same encoding the input was read with
        with open(os.path.join(outfolder, fn), "w", encoding="UTF-8") as output:
            output.write(result)
    print("Done. " + str(filecounter) + " files treated.")
if __name__ == "__main__":
    # Command-line entry point: forward the arguments to apply_ht as strings.
    # The three paths are required; language, output type and POS tagger are
    # optional and fall back to the defaults of apply_ht.
    if not 4 <= len(sys.argv) <= 7:
        sys.exit("Usage: " + sys.argv[0]
                 + " HDPATH INFOLDER OUTFOLDER [LANGUAGE [OUTTYPE [POS]]]")
    apply_ht(*sys.argv[1:])