SPIMI.py
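"""Single-Pass In-Memory Indexing (SPIMI).

spimi_invert streams (token, docid) pairs into an in-memory dictionary and
flushes one sorted block file to disk per BLOCK_SIZE tokens; merge_files
then performs a multi-way merge of all block files into the final index.
"""
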
import os
from lib import Token_Stream
from time import process_time

# Number of tokens consumed per block before the in-memory dictionary is
# flushed to disk (the commented-out value is a larger alternative).
BLOCK_SIZE = 2000000  # 10000000


def __full(dictionary, block_size):
    # Alternative "block is full" test, used by the commented-out loop
    # condition in spimi_invert: true once the dictionary exceeds block_size terms.
    return len(dictionary) > block_size


def spimi_invert(token_stream, output_file):
    """Consume up to BLOCK_SIZE tokens from token_stream and write one block
    of sorted (term, postings list) lines to output_file. Returns True once
    the stream is exhausted, False if more blocks remain."""
    with open(output_file, "w", encoding="utf8") as out:
        dictionary = {}
        words_processed = 0
        while words_processed < BLOCK_SIZE:
            # Alternative stopping rule based on dictionary size:
            # while not __full(dictionary, BLOCK_SIZE):
            try:
                token, docid = token_stream.next_token()
            except StopIteration:
                # Stream exhausted: flush the final, partial block and signal done.
                print("StopIteration")
                start = process_time()
                for key in sorted(dictionary.keys()):
                    out.write(key + " " + str(dictionary[key]) + "\n")
                del dictionary
                end = process_time()
                print("Dictionary written in", end - start, "s")
                return True
            if token not in dictionary:
                dictionary[token] = []
            postings_list = dictionary[token]
            # Assumes the stream yields all tokens of a document consecutively,
            # so comparing against the last posting is enough to avoid duplicates.
            if not postings_list or docid != postings_list[-1]:
                # if docid not in postings_list:
                postings_list.append(docid)
            words_processed += 1
        # Block is full: sort the terms and flush the block to disk.
        start = process_time()
        print("DICTIONARY FULL. Writing to Disk")
        for key in sorted(dictionary.keys()):
            out.write(key + " " + str(dictionary[key]) + "\n")
        del dictionary
        end = process_time()
        print("Dictionary written in", end - start, "s")
        print("SPIMI")
        return False
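

# On-disk block format, written above and parsed below: one term per line
# followed by its postings list, e.g. (hypothetical line):
#   apple [3, 17, 42]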
def __parse_line(line):
    """Parse one block line back into (term, postings list); readline()
    returns '' at end of file, which parses to None."""
    if line == '':
        return None
    word, arr = line[:line.index(' ')], line[line.index(' ') + 1:].strip('[').strip(']\n')
    res_list = [int(i) for i in arr.split(',')]
    return word, res_list


def merge_files(blocks_directory, output_file="spimi_final.txt"):
    """Multi-way merge of all block files in blocks_directory into a single
    sorted index file: each pass emits the lexicographically smallest current
    term together with the union of its postings lists across all blocks."""
    files_to_merge = [os.path.join(blocks_directory, f) for f in os.listdir(blocks_directory)
                      if os.path.isfile(os.path.join(blocks_directory, f))]
    with open(output_file, "w", encoding='utf8') as file:
        files = [open(f, 'r', encoding='utf8') for f in files_to_merge]
        current_words = [__parse_line(f.readline()) for f in files]
        # Drop blocks that are empty from the start (an empty file parses to
        # None, which the merge loop below must never index into).
        for i, word in enumerate(current_words):
            if word is None:
                files[i].close()
        files = [files[i] for i, word in enumerate(current_words) if word]
        current_words = [word for word in current_words if word]
        while len(current_words):
            # Find the smallest current term and collect the indices of every
            # block whose current line carries that term.
            cur = current_words[0][0]
            matched = []
            arr = current_words[0][1]
            for i in range(len(files)):
                if current_words[i][0] == cur:
                    matched.append(i)
                    # Union the postings lists, keeping docids sorted.
                    arr = sorted(set(arr) | set(current_words[i][1]))
                elif current_words[i][0] < cur:
                    # A smaller term: restart the match set with this block.
                    matched = [i]
                    cur = current_words[i][0]
                    arr = current_words[i][1]
            file.write("{} {}\n".format(cur, arr))
            # Advance every block that contributed the emitted term, then
            # drop any block that has been read to its end.
            to_leave = []
            for i in matched:
                current_words[i] = __parse_line(files[i].readline())
            for i in range(len(current_words)):
                if current_words[i]:
                    to_leave.append(i)
                else:
                    files[i].close()
            if len(to_leave) < len(files):
                files = [files[i] for i in to_leave]
                current_words = [current_words[i] for i in to_leave]
        return True
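

# A minimal usage sketch, not part of the original pipeline: scan the merged
# index for a single term and return its postings list. The file name and the
# helper name "lookup" are illustrative assumptions.
def lookup(term, index_file="spimi_final.txt"):
    # Reuse __parse_line, which understands the "term [d1, d2, ...]" format.
    with open(index_file, 'r', encoding='utf8') as f:
        for line in f:
            parsed = __parse_line(line)
            if parsed and parsed[0] == term:
                return parsed[1]
    return []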


if __name__ == "__main__":
    # Remove any block files left over from a previous run.
    blocks_directory = 'blocks'
    for the_file in os.listdir(blocks_directory):
        file_path = os.path.join(blocks_directory, the_file)
        try:
            os.unlink(file_path)
        except Exception as e:
            print(e)
    start = process_time()
    # Raw string: in a plain literal, "\s" would be an invalid escape sequence.
    DIRECTORY = r"C:\small_db"
    token_stream = Token_Stream.Token_Stream(
        [os.path.join(DIRECTORY, f) for f in os.listdir(DIRECTORY)
         if os.path.isfile(os.path.join(DIRECTORY, f))])
    # Invert blocks until spimi_invert reports that the stream is exhausted.
    spimi_counter = 0
    while True:
        done = spimi_invert(token_stream, blocks_directory + "/block" + str(spimi_counter) + ".txt")
        spimi_counter += 1
        if done:
            break
    print("DONE\n\n")
    end = process_time()
    print("BLOCKS CREATED IN", end - start, "s")
    start = process_time()
    merge_files(blocks_directory, "smallDB_uncompressedSPIMI.txt")
    end = process_time()
    print("BLOCKS MERGED IN", end - start, "s")