-
Notifications
You must be signed in to change notification settings - Fork 1
/
GraphOfDocs_Representation.py
76 lines (64 loc) · 3.62 KB
/
GraphOfDocs_Representation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import sys
import time
import pickle
from neo4j import ServiceUnavailable
from GraphOfDocs_Representation.neo4j_wrapper import Neo4jDatabase
from GraphOfDocs_Representation.graph_algos import GraphAlgos
from GraphOfDocs_Representation.create import *
from GraphOfDocs_Representation.select import *
def graphofdocs(create, initialize, dirpath):
# Open the database.
try:
database = Neo4jDatabase('bolt://localhost:7687', 'neo4j', '123')
# Neo4j server is unavailable.
# This client app cannot open a connection.
except ServiceUnavailable as error:
print('\t* Neo4j database is unavailable.')
print('\t* Please check the database connection before running this app.')
input('\t* Press any key to exit the app...')
sys.exit(1)
if create:
# Delete nodes from previous iterations.
database.execute('MATCH (n) DETACH DELETE n', 'w')
# Create uniqueness constraint on key to avoid duplicate word nodes.
create_unique_constraints(database)
# Create issues from json using the GraphOfDocs model.
start = time.perf_counter()
create_issues_from_json(database, dirpath)
end = time.perf_counter()
print(f'Create papers {end-start} sec')
# Create the similarity graph of topN = 10 similar words using emb. dim. = 300
start = time.perf_counter()
create_word2vec_similarity_graph(database, dirpath, 'jira_issues_300.model', 300)
end = time.perf_counter()
print(f'Created similarity graph in {end-start} sec')
if initialize: # Run initialization functions.
# Calculate the word embeddings for the word similarity graph and write them to csv files.
with GraphAlgos(database, ['Word'], [('similar_w2v', 'NATURAL', ['score'])]) as graph:
for dim in [100, 200, 300]:
# Generate the embeddings in the database.
graph.graphSage(f'gs_{dim}', embedding_dim = dim)
graph.node2vec(f'n2v_{dim}', embedding_dim = dim)
graph.fastRP(f'fastrp_{dim}', embedding_dim = dim)
graph.graphSage(f'gs_weighted_{dim}', embedding_dim = dim, rel_weight = 'score')
graph.fastRP(f'fastrp_weighted_{dim}', embedding_dim = dim, rel_weight = 'score')
# Export the embeddings in csv.
graph.write_word_embeddings_to_csv(f'gs_{dim}', f'gs_{dim}.csv')
graph.write_word_embeddings_to_csv(f'n2v_{dim}', f'n2v_{dim}.csv')
graph.write_word_embeddings_to_csv(f'fastrp_{dim}', f'fastrp_{dim}.csv')
graph.write_word_embeddings_to_csv(f'gs_weighted_{dim}', f'gs_weighted_{dim}.csv')
graph.write_word_embeddings_to_csv(f'fastrp_weighted_{dim}', f'fastrp_weighted_{dim}.csv')
# Construct the Issue similarity graph and calculate its communities.
with GraphAlgos(database, ['Issue', 'Word'], ['includes']) as graph:
graph.nodeSimilarity(write_property = 'score', write_relationship = 'is_similar', cutoff = 0.25, top_k = 1)
with GraphAlgos(database, ['Issue'], ['is_similar']) as graph:
graph.louvain(write_property = 'community')
# Save the n top terms of a community of similar document to a pickle.
for n in [250]:
x = get_communities_tags(database, top_terms = n)
tags = sum(x.values(), [])
with open(f'features_{n}.pkl', 'wb') as f:
pickle.dump(tags, f)
database.close()
return
if __name__ == '__main__': graphofdocs(False, True, 'issues_55k.json')