-
Notifications
You must be signed in to change notification settings - Fork 44
/
oov_analyser.py
132 lines (114 loc) · 3.38 KB
/
oov_analyser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# -*- coding: utf-8 -*-
"""Out-of-vocabulary (OOV) analysis of a tagged prediction file.

Usage:
    python oov_analyser.py TRAIN_FILE GOLD_FILE PREDICTION_FILE

All three files are read as UTF-8 text, one sentence per line, with tokens
separated by single spaces.  A token that splits on ``_`` into exactly two
parts is treated as ``word_TAG``; the full token is the "POS" form and the
part before the underscore is the "plain" form.

The script reports the vocabulary size of the training file, how many gold
tokens are unseen in training (plain and POS), and how many of the unseen
gold words the prediction file reproduces at the same character offset.
"""
import codecs
import sys

if sys.version_info[0] == 2:
    # Python 2 only: keep the original behaviour of forcing UTF-8 as the
    # implicit str/unicode conversion codec.  Python 3 has neither a builtin
    # ``reload`` nor ``sys.setdefaultencoding``, so the branch is skipped.
    sys = reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')


def _iter_sentences(path):
    """Yield every line of *path* as a list of its non-empty tokens.

    The file is closed as soon as iteration finishes.  Empty strings produced
    by blank lines or doubled spaces are dropped so that they are not counted
    as words.
    """
    with codecs.open(path, 'rb', encoding='utf-8') as handle:
        for line in handle:
            yield [token for token in line.strip().split(' ') if token]


def _ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 for a zero denominator."""
    return float(numerator) / denominator if denominator else 0.0


def _training_vocabulary(path):
    """Return (token count, set of plain words, set of full tokens) for *path*."""
    total = 0
    words = set()
    tagged = set()
    for tokens in _iter_sentences(path):
        total += len(tokens)
        for token in tokens:
            tagged.add(token)
            parts = token.split('_')
            if len(parts) == 2:
                words.add(parts[0])
    return total, words, tagged


def _gold_statistics(path, train_words, train_tagged):
    """Collect gold-file counts and the per-sentence positions of OOV words.

    Returns a dict of counters/sets plus ``oov_by_sentence``: one dict per
    gold line mapping the character offset at which an OOV word starts to its
    ``[word, tag]`` pair.
    """
    total = 0
    oov_w = 0
    oov_pos = 0
    words = set()
    tagged = set()
    oov_words = set()
    oov_tagged = set()
    oov_by_sentence = []
    for tokens in _iter_sentences(path):
        total += len(tokens)
        offset = 0
        oov_here = {}
        for token in tokens:
            tagged.add(token)
            if token not in train_tagged:
                oov_tagged.add(token)
                oov_pos += 1
            parts = token.split('_')
            if len(parts) == 2:
                words.add(parts[0])
                if parts[0] not in train_words:
                    oov_words.add(parts[0])
                    oov_w += 1
                    oov_here[offset] = parts
            # Advance for every token so that gold offsets stay comparable
            # with the offsets computed for the prediction file.
            offset += len(parts[0])
        oov_by_sentence.append(oov_here)
    return {
        'total': total,
        'words': words,
        'tagged': tagged,
        'oov_w': oov_w,
        'oov_pos': oov_pos,
        'oov_words': oov_words,
        'oov_tagged': oov_tagged,
        'oov_by_sentence': oov_by_sentence,
    }


def _score_predictions(path, oov_by_sentence):
    """Count how many gold OOV words the prediction file reproduces.

    A gold OOV word is correct (plain) when a predicted token starting at the
    same character offset has the same word part, and correct (POS) when the
    whole ``word_TAG`` token also matches.

    Returns (correct_w, correct_pos, incorrect_w, incorrect_pos); the two
    lists hold the gold words / gold tokens that were not reproduced.

    Raises:
        ValueError: if the prediction file has a non-blank line beyond the
            last gold line, i.e. the two files are not aligned.
    """
    correct_w = 0
    correct_pos = 0
    incorrect_w = []
    incorrect_pos = []
    for index, tokens in enumerate(_iter_sentences(path)):
        if index >= len(oov_by_sentence):
            if not tokens:
                continue  # trailing blank lines are harmless
            raise ValueError(
                'prediction file has more sentences than the gold file '
                '(line %d)' % (index + 1))
        # Map the start offset of each predicted token to (word, full token).
        starts = {}
        offset = 0
        for token in tokens:
            word = token.split('_')[0]
            starts[offset] = (word, token)
            offset += len(word)
        for start, gold in oov_by_sentence[index].items():
            gold_token = gold[0] + '_' + gold[1]
            predicted = starts.get(start)
            if predicted is not None and predicted[0] == gold[0]:
                correct_w += 1
                if predicted[1] == gold_token:
                    correct_pos += 1
                else:
                    incorrect_pos.append(gold_token)
            else:
                incorrect_w.append(gold[0])
                incorrect_pos.append(gold_token)
    return correct_w, correct_pos, incorrect_w, incorrect_pos


def analyse_oov(train_path, gold_path, prediction_path):
    """Run the full OOV analysis and return the results as a dict.

    Args:
        train_path: training file used to build the known vocabulary.
        gold_path: gold-standard test file.
        prediction_path: system output, line-aligned with the gold file.

    Returns:
        A dict with integer counts (``train_total``, ``train_words``,
        ``train_tagged``, ``gold_total``, ``gold_words``, ``gold_tagged``,
        ``oov_w``, ``oov_pos``, ``oov_w_unique``, ``oov_pos_unique``,
        ``correct_w``, ``correct_pos``), float ratios (``oov_w_ratio``,
        ``oov_pos_ratio``, ``correct_w_ratio``, ``correct_pos_ratio``; 0.0
        when the denominator is zero) and the diagnostic lists
        ``incorrect_w`` / ``incorrect_pos``.

    Raises:
        ValueError: if the prediction file is longer than the gold file.
        IOError / OSError: if a file cannot be opened.
    """
    train_total, train_words, train_tagged = _training_vocabulary(train_path)
    gold = _gold_statistics(gold_path, train_words, train_tagged)
    correct_w, correct_pos, incorrect_w, incorrect_pos = _score_predictions(
        prediction_path, gold['oov_by_sentence'])
    return {
        'train_total': train_total,
        'train_words': len(train_words),
        'train_tagged': len(train_tagged),
        'gold_total': gold['total'],
        'gold_words': len(gold['words']),
        'gold_tagged': len(gold['tagged']),
        'oov_w': gold['oov_w'],
        'oov_pos': gold['oov_pos'],
        'oov_w_unique': len(gold['oov_words']),
        'oov_pos_unique': len(gold['oov_tagged']),
        'oov_w_ratio': _ratio(gold['oov_w'], gold['total']),
        'oov_pos_ratio': _ratio(gold['oov_pos'], gold['total']),
        'correct_w': correct_w,
        'correct_pos': correct_pos,
        'correct_w_ratio': _ratio(correct_w, gold['oov_w']),
        # The POS accuracy is also measured against the number of OOV *words*,
        # because only OOV words are looked up in the prediction.
        'correct_pos_ratio': _ratio(correct_pos, gold['oov_w']),
        'incorrect_w': incorrect_w,
        'incorrect_pos': incorrect_pos,
    }


def main(argv=None):
    """Command-line entry point; prints the report and returns an exit code."""
    argv = sys.argv if argv is None else argv
    if len(argv) < 4:
        sys.stderr.write(
            'Usage: oov_analyser.py TRAIN_FILE GOLD_FILE PREDICTION_FILE\n')
        return 2
    stats = analyse_oov(argv[1], argv[2], argv[3])

    # Single parenthesised argument: valid as both the Python 2 print
    # statement and the Python 3 print function.
    print('Total numbers of words in the training set: %d.' % stats['train_total'])
    print('Total numbers of unique words in the training set: %d (plain), %d (POS).' % (stats['train_words'], stats['train_tagged']))
    print('')
    print('Total numbers of words in golden test set: %d.' % stats['gold_total'])
    print('Total numbers of unique words in golden test set: %d (plain), %d (POS).' % (stats['gold_words'], stats['gold_tagged']))
    print('')
    print('Total numbers of OOV words in golden test set: %d (plain), %d (POS).' % (stats['oov_w'], stats['oov_pos']))
    print('Unique OOV words in golden test set: %d (plain), %d (POS).' % (stats['oov_w_unique'], stats['oov_pos_unique']))
    print('Percentages of OOV words in golden test set: %f (plain), %f (POS).' % (stats['oov_w_ratio'], stats['oov_pos_ratio']))
    print('')
    print('Correct predicted OOV words: %d (plain), %d (POS).' % (stats['correct_w'], stats['correct_pos']))
    print('Percentages of correct predicted OOV words: %f (plain), %f (POS).' % (stats['correct_w_ratio'], stats['correct_pos_ratio']))
    print('')
    return 0


if __name__ == '__main__':
    sys.exit(main())