# articles_preprocessing_for_word2vec.py
import pickle
import re

import nltk
import pandas as pd
from os import path
from pathlib import Path
from tqdm import tqdm


def load_stop_words():
    # load Greek stop words from file (taken from spaCy), one word per line;
    # return a set for fast membership tests during filtering
    with open('stop_words.txt', 'r', encoding='utf-8-sig') as stop_words_file:
        stop_words = {line.strip() for line in stop_words_file if line.strip()}
    return stop_words
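
# Assumed format of stop_words.txt, inferred from the reader above: plain UTF-8
# text with one stop word per line, e.g.
#   και
#   να
#   το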

def text_processing(raw_text):
    # (optional) remove HTML4 entities https://www.w3schools.com/charsets/ref_html_entities_4.asp
    #raw_text = re.sub(r'&[a-zA-Z0-9]+;', ' ', raw_text)
    # (optional) strip punctuation first so the substitution below does not split acronyms:
    # without this step, ΕΛ.ΑΣ would be split into ΕΛ ΑΣ, and Ε.Ε into Ε Ε
    #raw_text = re.sub(r'[^\w\s]', '', raw_text)
    # replace every character that is not a Greek letter with a space
    processed_text = re.sub('[^Α-ΩΆΈΌΊΏΉΎΫΪ́α-ωάέόίώήύϊΐϋΰ]', ' ', raw_text)
    return processed_text
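
# Illustrative example (hypothetical input, not part of the pipeline): after
# text_processing, only the Greek tokens survive a subsequent split(), e.g.
#   text_processing('Η ΕΛ.ΑΣ. έκανε 5 συλλήψεις (video)').split()
#   -> ['Η', 'ΕΛ', 'ΑΣ', 'έκανε', 'συλλήψεις']
# since digits, Latin letters and punctuation are all replaced by spaces.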

def create_dictionary_of_articles():
    # define the location of the original articles
    original_articles_destination = Path('articles') / 'original'
    # list the per-source article folders
    media_sources_paths = original_articles_destination.glob('*')
    # build a dictionary with one dataframe per media source
    media_sources_articles = {}
    for media_source in media_sources_paths:
        # extract the media source name from the folder name
        media_source_name = media_source.name.split('_articles')[0]
        # list the csv files in the source's folder
        csv_files = media_source.glob('*.csv')
        df_list = []
        for csv_file in csv_files:
            media_source_df = pd.read_csv(csv_file)
            # parse the time columns as timezone-aware pandas datetimes
            media_source_df['upload_time'] = pd.to_datetime(media_source_df['upload_time'], utc=True)
            media_source_df['update_time'] = pd.to_datetime(media_source_df['update_time'], utc=True)
            df_list.append(media_source_df)
        merged_articles_df = pd.concat(df_list, axis=0)
        media_sources_articles[media_source_name] = merged_articles_df
    return media_sources_articles
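
# Assumed on-disk layout, inferred from the glob patterns above (the folder and
# file names are hypothetical examples):
#   articles/original/kathimerini_articles/2020_01.csv
#   articles/original/kathimerini_articles/2020_02.csv
# A folder named 'kathimerini_articles' produces the dictionary key 'kathimerini'.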

if __name__ == "__main__":
    # load the nltk Greek sentence tokenizer
    tokenizer = nltk.data.load('tokenizers/punkt/greek.pickle')
    # Greek stop words from spaCy
    stop_words = load_stop_words()
    # a dictionary containing a dataframe of articles for each source
    media_sources_articles = create_dictionary_of_articles()
    for media_source_name, articles_df in media_sources_articles.items():
        total_articles = len(articles_df.index)
        print(media_source_name)
        # create a pickle file containing all article sentences from this media source
        pkl_path = path.join('articles', 'processed', 'word2vec_pickles', '{}.pkl'.format(media_source_name))
        with open(pkl_path, 'wb') as media_source_sentences_pkl:
            for article_index, article in tqdm(articles_df.iterrows(), total=total_articles):
                if isinstance(article['main_text'], str):
                    # tokenize the article into sentences with the nltk Greek tokenizer
                    raw_sentences = tokenizer.tokenize(article['main_text'])
                    for raw in raw_sentences:
                        # clean the sentence
                        clean = text_processing(raw)
                        # split the sentence into a list of tokens
                        tokens = clean.split()
                        # init the filtered tokens list
                        filtered_tokens = []
                        for token in tokens:
                            token = token.lower()
                            # keep only tokens of 2 characters or more
                            if len(token) >= 2:
                                # drop stop words
                                if token not in stop_words:
                                    filtered_tokens.append(token)
                        # discard sentences left with no tokens
                        if len(filtered_tokens) > 0:
                            pickle.dump(filtered_tokens, media_source_sentences_pkl)
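
# Minimal sketch of how a per-source pickle file could be read back, e.g. to feed
# the sentences into gensim's Word2Vec (the 'load_sentences' helper and the gensim
# usage below are assumptions, not part of this script). Each pickle.dump() call
# above appended one token list, so reading back means calling pickle.load()
# repeatedly on the same file handle until EOFError:
#
# def load_sentences(pkl_path):
#     sentences = []
#     with open(pkl_path, 'rb') as f:
#         while True:
#             try:
#                 sentences.append(pickle.load(f))
#             except EOFError:
#                 break
#     return sentences
#
# sentences = load_sentences(path.join('articles', 'processed', 'word2vec_pickles', 'kathimerini.pkl'))
# model = gensim.models.Word2Vec(sentences=sentences)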