# gen_ontology.py

import re
from string import Template

import networkx as nx
import requests

G = nx.DiGraph()

# Exclusion list used by capitalizeURI: words in this list keep their original
# casing instead of having every letter after the first one lowercased.
capital_word_list = [
    "BPM"
]

UPDATE_CITATIONS = False

def capitalizeURI(word):
    if word not in capital_word_list:
        return word.capitalize()
    else:
        return word
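
# Illustrative examples, assuming the default capital_word_list above:
#   capitalizeURI("oracle") -> "Oracle"
#   capitalizeURI("BPM")    -> "BPM"  (listed, so its casing is preserved)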

def get_paper_from_id(papers, paper_id):
    for p in papers:
        if p['ID'] == paper_id:
            return p
    return False

def get_proposal_from_id(proposals, paper_id):
    for p in proposals:
        if p['ID'] == paper_id:
            return p
    return False

def get_proposals_from_URI(proposals, paper_uri):
    compatible_patterns = []
    for p in proposals:
        if parse_to_URI(p['Name']) == paper_uri:
            compatible_patterns.append(p)
    return compatible_patterns

# parse a name into an ontology URI (each word capitalized, special characters removed)
def parse_to_URI(name):
    nameArray = re.split(r'[, \-!?:()/&*]+', name)
    nameArray = [capitalizeURI(word) for word in nameArray]
    return ''.join(nameArray)
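
# Illustrative example:
#   parse_to_URI("BPM-based process execution") -> "BPMBasedProcessExecution"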

# parse a name into an ontology relation (first word lowercased, following words capitalized, special characters removed)
def parse_to_relation(name):
    nameArray = re.split(r'[, \-!?:()/&*]+', name)
    nameArray = [nameArray[0].lower()] + [capitalizeURI(word) for word in nameArray[1:]]
    return ''.join(nameArray)
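
# Illustrative example:
#   parse_to_relation("Created from") -> "createdFrom"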

def get_proposal_URI(proposal, papers):
    paper = get_paper_from_id(papers, proposal['Paper'])
    return parse_to_URI(proposal['Name']) + get_paper_URI(paper)

def get_proposal_name(proposal, papers):
    paper = get_paper_from_id(papers, proposal['Paper'])
    return "{} (Paper {})".format(proposal['Name'], paper['ID'])

def get_paper_URI(paper):
    return 'Paper' + paper['ID']
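
# Illustrative example: a proposal named "Oracle" coming from the paper with
# ID "3" yields the URI "OraclePaper3" and the name "Oracle (Paper 3)".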

# generate the links between proposals. Five different link types exist;
# they are listed in link_types below for easy detection in the dict.
def get_links_between_proposals(proposals, example_mapping, papers):
    relations = {}
    link_types = ["Created from", "Related to", "Variant of", "Requires", "Benefits from"]
    for proposal in proposals:
        proposal_uri = get_proposal_URI(proposal, papers)
        for key in link_types:
            if key in proposal:
                proposal_relations = re.split(', ', proposal[key])
                relation_type = parse_to_relation(key)
                for r in proposal_relations:
                    if r.isdigit():
                        target = get_proposal_from_id(proposals, r)
                        target_uri = get_proposal_URI(target, papers)
                    else:
                        if parse_to_URI(r) in example_mapping:
                            target_uri = example_mapping[parse_to_URI(r)]
                        else:
                            target_uri = "Unknown"
                    if proposal_uri not in relations:
                        relations[proposal_uri] = []
                    relations[proposal_uri].append({
                        'target': target_uri,
                        'relation': relation_type
                    })
    return relations
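
# Illustrative shape of the returned dict (hypothetical URIs):
#   {'OraclePaper3': [{'target': 'DataMigration', 'relation': 'relatedTo'}]}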

# render the stored links of one proposal as relation-template strings
def get_proposal_links_parsed(proposal_uri, proposals_links):
    if proposal_uri in proposals_links:
        relation_template = load_template('relation')
        relations_str = ''
        for link in proposals_links[proposal_uri]:
            relations_str += Template(relation_template).substitute(
                relation=link['relation'],
                value=link['target']
            )
        return relations_str
    else:
        return ''

# load the SLR data; returns three lists: papers, proposals, and pattern classes
def load_SLR_data():
    r = requests.get('https://raw.githubusercontent.com/harmonica-project/blockchain-patterns-collection/main/collection.json')
    if r.status_code == 200:
        content = r.json()
        return content['Papers'], content['Proposals'], content['Patterns']
    else:
        print('Cannot retrieve the JSON pattern file from GitHub.')
        quit()

# load an ontology template
def load_template(item):
    with open('./templates/' + item + '.txt', 'r') as file:
        return file.read()
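
# Note: templates are string.Template files; the substitute() calls in this
# script suggest, for instance, that templates/relation.txt uses the
# placeholders $relation and $value. The actual contents live in ./templates/.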

def get_application_examples(p):
    examples = ""
    example_template = load_template('example')
    if 'Application examples' in p:
        for example in p['Application examples'].split("• "):
            if len(example):
                examples += Template(example_template).substitute(example=example.replace("\n", ""))
        return examples
    else:
        return ''
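
# Illustrative: '• A\n• B'.split('• ') -> ['', 'A\n', 'B']; the empty first
# element is skipped by the len() check, and newlines are stripped per example.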

# create the mapping from proposal alternative names (and canonical names) to pattern classes
def create_proposal_to_class_mapping(pattern_classes):
    proposal_mapping = {}
    for c in pattern_classes:
        if 'Alternative names' in c:
            for an in c['Alternative names'].split(', '):
                proposal_mapping[parse_to_URI(an)] = parse_to_URI(c['Name'])
        proposal_mapping[parse_to_URI(c['Name'])] = parse_to_URI(c['Name'])
    return proposal_mapping
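
# Illustrative result for a hypothetical class "Oracle" with alternative name
# "Data provider": {'DataProvider': 'Oracle', 'Oracle': 'Oracle'}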

# parse context/solution fields of pattern objects into literals that can be attached to individuals
def parse_to_ontology_literal_if_exists(item, key):
    if key in item:
        return item[key].replace('"', '').replace("\n", "")
    else:
        return ""

def get_paper_properties(paper):
    property_template = load_template('property')
    properties = (
        Template(property_template).substitute(property="Authors", value='"{}"^^rdfs:Literal'.format(paper['author'])) +
        Template(property_template).substitute(property="Year", value='"{}"^^rdfs:Literal'.format(paper['year'])) +
        Template(property_template).substitute(property="Title", value='"{}"^^rdfs:Literal'.format(paper['Title']))
    )
    # possible improvement: iterate over the paper properties instead, but the paper spreadsheet must be cleaned a bit first
    # another improvement: use an existing ontology for papers, e.g. a BibTeX-based one (if such an ontology exists)
    if "Identifier" in paper:
        properties += Template(property_template).substitute(property="Identifier", value='"{}"^^rdfs:Literal'.format(paper['Identifier']))
    if "Identifier type" in paper:
        properties += Template(property_template).substitute(property="IdentifierType", value='"{}"^^rdfs:Literal'.format(paper['Identifier type']))
    if "journal" in paper:
        properties += Template(property_template).substitute(property="Journal", value='"{}"^^rdfs:Literal'.format(paper['journal']))
    if "pages" in paper:
        properties += Template(property_template).substitute(property="Pages", value='"{}"^^rdfs:Literal'.format(paper['pages']))
    if "volume" in paper:
        properties += Template(property_template).substitute(property="Volume", value='"{}"^^rdfs:Literal'.format(paper['volume']))
    if "type" in paper:
        properties += Template(property_template).substitute(property="DocumentType", value='"{}"^^rdfs:Literal'.format(paper['type']))
    return properties

def generate_citations(papers):
    print('Generating citations, this may take a while ...')
    citations = ''
    papers = {int(paper["ID"]): {"title": paper["Title"], "doi": paper["Identifier"]}
              for paper in papers if "Identifier" in paper}
    max_id = max(papers.keys())
    # iterate over a snapshot: papers discovered below are added to the dict
    # but are not themselves queried for citations
    for identifier, data in list(papers.items()):
        G.add_node(identifier, snowball=False, doi=data["doi"])
        resp = requests.get(
            f"https://scholar.miage.dev/snowball?title={data['title']}",
            headers={"Accept": "application/json"})
        if resp.status_code == 200:
            for ref in resp.json():
                if ref == "":
                    continue
                if ref not in {paper["doi"] for paper in papers.values()}:
                    # unknown DOI: register it as a new snowballed paper
                    max_id += 1
                    new_identifier = max_id
                    papers[new_identifier] = {"doi": ref}
                    G.add_node(new_identifier, snowball=True, doi=ref)
                    G.add_edge(new_identifier, identifier)
                    print(f"New paper {ref} cites {data['doi']}")
                else:
                    # known DOI: only add the citation edge
                    edge_identifier = [iid for iid in papers.keys() if papers[iid]["doi"] == ref][0]
                    if identifier != edge_identifier:
                        G.add_edge(edge_identifier, identifier)
                        print(f"Known paper {ref} cites {data['doi']}")
    for citer in G.nodes:
        for citee in G[citer]:
            citations += f":Identifier{citer} :references :Identifier{citee}.\n"
    for node in G.nodes:
        citations += f":Identifier{node} rdf:type :Identifier.\n"
        citations += f":Identifier{node} rdfs:label \"{G.nodes[node]['doi']}\"^^rdfs:Literal.\n"
    return citations
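
# Illustrative output lines (hypothetical identifiers and DOI):
#   :Identifier4 :references :Identifier1.
#   :Identifier4 rdf:type :Identifier.
#   :Identifier4 rdfs:label "10.1000/xyz123"^^rdfs:Literal.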

def generate_papers(papers_list):
    paper_template = load_template('paper')
    papers = ""
    for paper in papers_list:
        papers += Template(paper_template).substitute(
            title=paper['Title'],
            properties=get_paper_properties(paper),
            owner="nicolas",
            id=paper['ID']
        )
    return papers

# generate_proposals() returns the proposals found in papers
def generate_proposals(proposals, proposal_mapping, proposals_links, papers, variants_mapping):
    proposals_str = ""
    proposal_template = load_template('proposal')
    # iterate over pattern proposals to generate "proposal pattern" individuals
    for p in proposals:
        # get the associated paper from the proposal's paper id
        paper = get_paper_from_id(papers, p['Paper'])
        proposal_uri = get_proposal_URI(p, papers)
        paper_uri = get_paper_URI(paper)
        if proposal_uri in variants_mapping:
            variant = parse_to_URI(p['Name'])
        else:
            variant = proposal_mapping[parse_to_URI(p['Name'])]
        # generate pattern individuals, connected to their classes
        proposals_str += Template(proposal_template).substitute(
            owner="nicolas",
            proposal_uri=proposal_uri,
            paper_uri=paper_uri,
            name=get_proposal_name(p, papers),
            variant=variant,
            blockchain=parse_to_URI(p['Target']),
            domain=parse_to_URI(p['Applicability domain']),
            context=parse_to_ontology_literal_if_exists(p, 'Context & Problem'),
            solution=parse_to_ontology_literal_if_exists(p, 'Solution'),
            links=get_proposal_links_parsed(proposal_uri, proposals_links),
            examples=get_application_examples(p),
            language=parse_to_URI(p['Language'])
        )
    return proposals_str

# generate_variants() returns all pattern variants, plus a mapping from proposal URIs to their pattern class
def generate_variants(pattern_classes, proposals, papers):
    variants_str = ''
    variants_mapping = {}
    variant_template = load_template('variant')
    for p in pattern_classes:
        if 'Variant' in p:
            for p_id in p['Variant'].split(', '):
                proposal = get_proposal_from_id(proposals, p_id)
                proposal_uri = get_proposal_URI(proposal, papers)
                variants_mapping[proposal_uri] = parse_to_URI(p['Name'])
                variants_str += Template(variant_template).substitute(
                    owner="nicolas",
                    uri=parse_to_URI(proposal['Name']),
                    refClass=parse_to_URI(p['Name']),
                    name=proposal['Name']
                )
            # also register the class itself as its own default variant
            variants_str += Template(variant_template).substitute(
                owner="nicolas",
                uri=parse_to_URI(p['Name']),
                refClass=parse_to_URI(p['Name']),
                name=p['Name']
            )
    return variants_str, variants_mapping
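
# Illustrative variants_mapping shape (hypothetical data): each variant proposal
# URI maps to its pattern class URI, e.g. {'OraclePaper3': 'Oracle'}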

# generate_patterns() returns all pattern classes
def generate_patterns(pattern_classes):
    classes = ''
    class_template = load_template('class')
    for p in pattern_classes:
        patternType = parse_to_URI(p['Type (determined)'])
        # link the individual to its class
        if patternType == "ArchitecturalPattern" or patternType == "Idiom":
            patternCategory = patternType
        else:
            if "Subsubcategory" in p:
                patternCategory = parse_to_URI(p['Subsubcategory'])
            else:
                patternCategory = parse_to_URI(p['Subcategory'])
        # create all classes using unique pattern names (they become canonical individuals later)
        classes += Template(class_template).substitute(
            owner="nicolas",
            uri=parse_to_URI(p['Name']),
            name=p['Name'],
            category=patternCategory
        )
    return classes

# run() is the entry point of the script: it loads the templates and SLR data,
# then iterates over canonicals and examples to generate the classes,
# canonical individuals, and example individuals
def run():
    papers, proposals, pattern_classes = load_SLR_data()
    with open('../ontologies/structure.ttl', 'r') as file:
        ontology_structure = file.read()
    # proposal_mapping maps each alternative-name URI (and each canonical-name URI) to its canonical class URI
    proposal_mapping = create_proposal_to_class_mapping(pattern_classes)
    proposals_links = get_links_between_proposals(proposals, proposal_mapping, papers)
    patterns_ttl = generate_patterns(pattern_classes)
    variants_ttl, variants_mapping = generate_variants(pattern_classes, proposals, papers)
    proposals_ttl = generate_proposals(proposals, proposal_mapping, proposals_links, papers, variants_mapping)
    papers_ttl = generate_papers(papers)
    # regenerate the citation triples only when requested; otherwise reuse the cached file
    if UPDATE_CITATIONS:
        citations_ttl = generate_citations(papers)
    else:
        with open("./results/citation_triples.ttl", "r") as text_file_citations:
            citations_ttl = text_file_citations.read()
    # write classes, proposals, papers, variants, and citations to distinct files; they can be merged into a complete ontology
    with open("./results/classes.ttl", "w") as text_file_classes:
        text_file_classes.write(patterns_ttl)
    with open("./results/proposals.ttl", "w") as text_file_proposals:
        text_file_proposals.write(proposals_ttl)
    with open("./results/papers.ttl", "w") as text_file_papers:
        text_file_papers.write(papers_ttl)
    with open("./results/variants.ttl", "w") as text_file_variants:
        text_file_variants.write(variants_ttl)
    with open("./results/citations.ttl", "w") as text_file_citations:
        text_file_citations.write(citations_ttl)
    with open("../ontologies/ontology.ttl", "w") as text_file_ontology:
        text_file_ontology.write(ontology_structure + patterns_ttl + proposals_ttl + papers_ttl + variants_ttl + citations_ttl)

if __name__ == '__main__':
    run()