-
Notifications
You must be signed in to change notification settings - Fork 0
/
Ranging.py
113 lines (95 loc) · 4.27 KB
/
Ranging.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os.path
import chardet
import re
import string
from math import log2
class Ranger:
@staticmethod
def process_word(word):
return word.rstrip().strip(string.punctuation).lower()
@staticmethod
def __process_zone(zone_text):
tokens = re.split("\W+", zone_text.rstrip())
tf_index = dict()
for token in tokens:
word = Ranger.process_word(token)
if word in tf_index.keys():
tf_index[word] += 1
else:
tf_index[word] = 1
for key in tf_index:
tf_index[key] /= len(tokens)
return tf_index
def __init__(self, text_directory):
self.list_of_files = [os.path.join(text_directory, f) for f in os.listdir(text_directory) if
os.path.isfile(os.path.join(text_directory, f))]
self.common_dictionary = dict()
self.file_length = []
for file_name in self.list_of_files:
print(file_name)
encoding = chardet.detect(open(file_name, "rb").read())['encoding']
local_dict = {}
tokens_amount = 0
with open(file_name, "r", encoding=encoding) as file:
line = file.readline()
while line and len(line):
tokens = re.split("\W+", line.rstrip())
tokens_amount += len(tokens)
for t in tokens:
w = Ranger.process_word(t)
if w in local_dict:
local_dict[w] += 1
else:
local_dict[w] = 1
line = file.readline()
self.file_length.append(tokens_amount)
for w in local_dict.keys():
if w in self.common_dictionary.keys():
self.common_dictionary[w][self.list_of_files.index(file_name)] = {
'body': local_dict[w] / tokens_amount}
else:
self.common_dictionary[w] = {
self.list_of_files.index(file_name): {'body': local_dict[w] / tokens_amount}}
tf_title = Ranger.__process_zone(file_name.split("\\")[-1].strip(".txt"))
for w in tf_title:
if w in self.common_dictionary.keys():
if self.list_of_files.index(file_name) in self.common_dictionary.keys():
self.common_dictionary[w][self.list_of_files.index(file_name)]['title'] = tf_title[w]
else:
self.common_dictionary[w][self.list_of_files.index(file_name)] = {'title': tf_title[w]}
else:
self.common_dictionary[w] = {self.list_of_files.index(file_name): {'title': tf_title[w]}}
ZONE_WEIGHTS = {'title': 0.95, 'body': 0.05}
def zone_search(self, query):
scores = [0.0 for i in range(len(self.list_of_files))]
query_tokens = [Ranger.process_word(t) for t in re.split("\W+", query.rstrip())]
weights_term_query = [1.0 for i in range(len(query_tokens))]
for token_index in range(len(query_tokens)):
token = query_tokens[token_index]
if token not in self.common_dictionary.keys():
continue
posting_td = self.common_dictionary[token]
idf = log2(len(self.list_of_files) / len(posting_td))
for docid in posting_td.keys():
for zone_name in self.ZONE_WEIGHTS.keys():
if zone_name in posting_td[docid].keys():
scores[docid] += posting_td[docid][zone_name] * self.ZONE_WEIGHTS[zone_name] * idf * \
weights_term_query[token_index]
for i in range(len(scores)):
scores[i] /= self.file_length[i]
AMOUNT_OF_RESULTS_TO_RETURN = 5
res = []
for i in range(AMOUNT_OF_RESULTS_TO_RETURN):
j = scores.index(max(scores))
if scores[j] <= 0:
break
res.append(self.list_of_files[j])
scores[j] = -1
return res
if __name__ == "__main__":
ranger = Ranger("text")
while True:
req = input("\n\nEnter Ranger search request: ")
if req.lower() == "exit":
break
print(ranger.zone_search(req))