-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.py
61 lines (41 loc) · 2.03 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import json
import math
def index(no_of_docs=1000):
    """Build tf-idf indexes from pre-analyzed per-article word counts.

    Reads 'analyzed/_indexedArticles.json' (a mapping of articleID ->
    articleName), then each 'analyzed/<articleName>.json' file, which maps
    word -> occurrence count for that article.

    Writes two JSON files to the current directory:
      - 'words_total_frequency.json': word -> number of documents
        containing that word (document frequency).
      - 'words_weight_per_doc.json': word -> {articleID: tf-idf weight}.

    Args:
        no_of_docs: Total corpus size used as the idf numerator. Defaults
            to 1000, the value previously hard-coded here.
    """
    matrix_tf = {}               # word -> {articleID: tf, later tf-idf weight}
    matrix_word_frequency = {}   # word -> document frequency

    # Mapping of articleID -> articleName produced by the analysis step.
    with open('analyzed/_indexedArticles.json', 'r') as f:
        articles = json.load(f)

    for article_index, article_name in articles.items():
        # Skip macOS filesystem metadata that may have been indexed.
        if article_name == '.DS_Store':
            continue
        with open('analyzed/' + article_name + '.json', 'r') as f:
            article_words = json.load(f)
        # Guard: an empty word map would make max() raise ValueError.
        if not article_words:
            continue
        # Augmented tf: each raw count is normalized by the count of the
        # most frequent term in this document.
        most_occurrence = max(article_words.values())
        for word, count in article_words.items():
            matrix_tf.setdefault(word, {})[article_index] = count / most_occurrence
            # Document frequency: each word is seen at most once per article
            # here because we iterate the word map's distinct keys.
            matrix_word_frequency[word] = matrix_word_frequency.get(word, 0) + 1

    # Convert the stored tf values into tf-idf weights in place. idf depends
    # only on the word, so compute it once per word, not once per document.
    for word, per_doc in matrix_tf.items():
        idf_of_word = math.log(no_of_docs / matrix_word_frequency[word], 2)
        for doc_id, tf_of_word in per_doc.items():
            per_doc[doc_id] = tf_of_word * idf_of_word

    with open('words_total_frequency.json', 'w') as f:
        json.dump(matrix_word_frequency, f)
    with open('words_weight_per_doc.json', 'w') as f:
        json.dump(matrix_tf, f)