forked from abasirat/principal_word_vectors
-
Notifications
You must be signed in to change notification settings - Fork 0
/
conllu2context.py
66 lines (48 loc) · 1.46 KB
/
conllu2context.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#
# This program is written by Ali Basirat [email protected] as part of the
# project Principla Word Vectors at http://urn.kb.se/resolve?urn=urn:nbn:se:uu:diva-353866
# You are allowed to modify or distribute it if you keep this header part
#
# It can be used to generate an annotated corpus from a CoNLL-U format file (e.g., a UD corpus)
#
import sys
AND = "_"
OR = ","  # not referenced below; kept so the module-level names stay as they were
LOWER_CASE = 1


def _parse_token(line):
    """Parse one stripped CoNLL-U token line into (ID, word, pos, pid, drel).

    Returns None when the ID column is not a plain integer (for instance
    "1-2" or "1.1"), so that such lines are left out of the sentence.
    Raises ValueError when the line has fewer than eight columns.
    """
    # CoNLL-U columns are tab-separated and FORM/LEMMA may contain spaces, so
    # split on tabs; fall back to whitespace splitting for tab-less input.
    toks = line.split("\t") if "\t" in line else line.split()
    if len(toks) < 8:
        raise ValueError("malformed CoNLL-U token line: {0!r}".format(line))
    tok_id = toks[0]
    if not tok_id.isnumeric():
        return None
    word = toks[1]
    # A literal comma in the POS tag is spelled out in the feature string.
    pos = toks[3].replace(',', "COMMA")
    pid = toks[6]
    # Keep only the main relation, dropping any ":subtype" suffix.
    drel = toks[7].split(':')[0]
    return (tok_id, word, pos, pid, drel)


def _format_sentence(items):
    """Return the output rows ("ID<TAB>word<TAB>head<TAB>feature") of a sentence.

    The feature is "pos_drel", extended with "_headpos_headdrel" when the
    token has a head other than 0.  Raises ValueError when a head ID does not
    belong to any token of the sentence.
    """
    # Look heads up by token ID rather than by list position, so the result
    # stays correct even if the kept IDs are not contiguous.
    by_id = {int(tok[0]): tok for tok in items}
    rows = []
    for tok_id, word, pos, pid, drel in items:
        head = int(pid)
        feat = pos + AND + drel
        if head > 0:
            if head not in by_id:
                raise ValueError(
                    "token {0} refers to unknown head {1}".format(tok_id, pid))
            parent = by_id[head]
            feat += AND + parent[2] + AND + parent[4]
        if LOWER_CASE:
            word = word.lower()
        rows.append("{0}\t{1}\t{2}\t{3}".format(tok_id, word, pid, feat))
    return rows


def convert(lines):
    """Yield the output lines for an iterable of CoNLL-U input lines.

    For every blank input line an empty output line is yielded, followed by
    the rows of the sentence collected since the previous blank line.  Lines
    starting with '#' are ignored.  A final sentence that is not followed by
    a blank line is emitted the same way once the input is exhausted.
    """
    items = []
    for raw in lines:
        line = raw.strip()
        if not line:
            # Sentence boundary: the empty line goes out before the rows.
            yield ""
            for row in _format_sentence(items):
                yield row
            items = []
            continue
        if line[0] == '#':
            continue
        tok = _parse_token(line)
        if tok is not None:
            items.append(tok)
    if items:
        # Input ended without a closing blank line; do not drop the sentence.
        yield ""
        for row in _format_sentence(items):
            yield row


def main(argv=None):
    """Convert the CoNLL-U file named by argv[1] and print the result.

    argv defaults to sys.argv.  Exits with a usage message when no input
    file is given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: conllu2context.py <file.conllu>")
    infile = argv[1]
    # CoNLL-U files are UTF-8; do not depend on the platform's locale encoding.
    with open(infile, 'r', encoding="utf-8") as fp:
        for out_line in convert(fp):
            print(out_line)


if __name__ == "__main__":
    main()