-
Notifications
You must be signed in to change notification settings - Fork 1
/
ArtDistance.py
64 lines (51 loc) · 2.22 KB
/
ArtDistance.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import xml.etree.ElementTree as ET
from collections import Counter
import os
import pandas as pd
# Corpus-wide tally shared by every treebank file processed below:
# maps a signed article-to-head distance to the number of times it occurred.
artPos = Counter()
def proieltbs(treebank, artpos, filename):
    """Tally article-to-head distances for one parsed treebank.

    The tree is walked four levels below its root (source, division,
    sentence, token).  A token is considered when its lemma is 'ὁ', its
    part-of-speech is 'S-' and its relation is 'aux'.  For each such token
    the distance is computed as ``head-id - id``.

    Args:
        treebank: Parsed ``ElementTree`` of the treebank file.
        artpos: Counter mapping distance -> frequency; updated in place.
        filename: Name of the file being processed, used only when
            reporting outliers.

    Returns:
        The same ``artpos`` counter that was passed in.
    """
    # Flatten the four nesting levels into a single stream of tokens.
    tokens = (
        tok
        for src in treebank.getroot()
        for div in src
        for sent in div
        for tok in sent
    )
    for tok in tokens:
        if tok.get('lemma') != 'ὁ':
            continue
        if tok.get('part-of-speech') != 'S-':
            continue
        if tok.get('relation') != 'aux':
            continue
        # Head id minus article id: positive when the head has the larger id.
        offset = int(tok.get('head-id')) - int(tok.get('id'))
        # Only distances below 100 are tallied.
        if offset < 100:
            artpos[offset] += 1
        # Distances below -100 are still tallied, but are reported for inspection.
        if offset < -100:
            print(filename, tok.get('id'))
    return artpos
def perseustbs(treebank, artpos, filename):
    """Tally article-to-head distances for one parsed treebank.

    The tree is walked three levels below its root (body, sentence, word).
    A word is considered when its lemma is 'ὁ' and its relation is 'ATR'.
    For each such word the distance is computed as ``head - id`` and
    counted in ``artpos``.

    Args:
        treebank: Parsed ``ElementTree`` of the treebank file.
        artpos: Counter mapping distance -> frequency; updated in place.
        filename: Name of the file being processed; printed together with
            the word id when an implausibly large distance is found.

    Returns:
        The same ``artpos`` counter that was passed in.
    """
    for body in treebank.getroot():
        for sentence in body:
            for word in sentence:
                if word.get('lemma') == 'ὁ' and word.get('relation') == 'ATR':
                    # Head id minus article id: positive when the head has
                    # the larger id.
                    artdistance = int(word.get('head')) - int(word.get('id'))
                    artpos[artdistance] += 1
                    if artdistance > 1000:
                        # Include the file name so the outlier can be traced
                        # when many files are processed in one run; a bare
                        # word id is ambiguous across files.
                        print(filename, word.get('id'))
    return artpos
# Work from inside the corpus directory so that the bare file names returned
# by listdir can be opened directly.
corpus_dir = '/home/chris/Desktop/CustomTB'
os.chdir(corpus_dir)
for file_name in os.listdir(corpus_dir):
    # Skip the repository housekeeping entries; everything else is parsed as XML.
    if file_name in ('README.md', '.git'):
        continue
    tb = ET.parse(file_name)
    print(file_name)
    # The root tag decides which of the two treebank walkers handles the file.
    root_tag = tb.getroot().tag
    if root_tag == 'proiel':
        artPos = proieltbs(tb, artPos, file_name)
    elif root_tag == 'treebank':
        artPos = perseustbs(tb, artPos, file_name)

# One CSV row per distinct distance (the index) with its frequency.
df = pd.DataFrame.from_dict(artPos, orient='index')
outpath = os.path.join('/home/chris/Desktop', 'NewArtDistance.csv')
df.to_csv(outpath)
print(artPos)