preprocessing.py

# NLTK provides the stopword corpus and the Toktok tokenizer used below
import nltk
# Toktok is a word tokenizer; it is used here to tokenize text for stopword removal
from nltk.tokenize.toktok import ToktokTokenizer
# This dictionary will be used to expand contractions (e.g. we'll -> we will)
from contractions import contractions_dict
import re
# Unicodedata will be used to remove accented characters
import unicodedata
# BeautifulSoup will be used to remove html tags
from bs4 import BeautifulSoup
# spaCy is used for lemmatisation; the model must be installed first
# (python -m spacy download en_core_web_sm)
import spacy

nlp = spacy.load("en_core_web_sm")

def strip_html_tags(text):
    """Remove html tags from text."""
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text
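
# Illustrative example (made-up input):
#   strip_html_tags("<p>Hello <b>world</b></p>")  ->  "Hello world"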

def remove_accented_chars(text):
    """Remove accented characters."""
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
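
# Illustrative example: NFKD decomposes each accented letter into a base letter
# plus a combining mark, which the ASCII encode/decode round trip then drops:
#   remove_accented_chars("café résumé")  ->  "cafe resume"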

def remove_special_characters(text):
    """Remove special characters."""
    # Note the capital Z: the original a-zA-z range accidentally also kept the
    # characters between 'Z' and 'a' ([, \, ], ^, _ and the backtick)
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text
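
# Illustrative example (made-up input):
#   remove_special_characters("Hello, world! #1")  ->  "Hello world 1"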

def lemmatize_text(text):
    """Lemmatise the text."""
    text = nlp(text)
    # spaCy 2.x lemmatises pronouns to the placeholder '-PRON-'; keep the original
    # token text in that case (spaCy 3.x no longer produces '-PRON-')
    return ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
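
# Illustrative example (exact lemmas depend on the loaded spaCy model and version):
#   lemmatize_text("the children were running")  ->  "the child be run"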

def expand_contractions(text, contraction_mapping=contractions_dict):
    """Find and expand text contractions."""
    # Build the pattern from the mapping that was passed in (the original built it
    # from the module-level contractions_dict, ignoring the parameter); longest
    # keys first so e.g. "can't've" matches before "can't"
    contractions_pattern = re.compile('({})'.format(
        '|'.join(sorted(contraction_mapping.keys(), key=len, reverse=True))),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower()))
        # Keep the first character of the match so capitalisation is preserved
        return first_char + expanded_contraction[1:] if expanded_contraction is not None else match

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
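
# Illustrative example (exact expansions depend on the contents of contractions_dict):
#   expand_contractions("We'll see, y'all")  ->  "We will see, you all"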

nltk.download('stopwords')
stopword_list = nltk.corpus.stopwords.words('english')
# Keep the negations 'no' and 'not': they often carry meaning
# (e.g. for sentiment) that stopword removal would otherwise destroy
stopword_list.remove('no')
stopword_list.remove('not')

def remove_stopwords(text, is_lower_case=False):
    """Remove stopwords from text.

    If is_lower_case is False, tokens are lowercased only for the
    stopword lookup, so the original casing is preserved in the output.
    """
    tokenizer = ToktokTokenizer()
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
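
# Illustrative example ('not' survives because it was removed from stopword_list):
#   remove_stopwords("this is not a good movie")  ->  "not good movie"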

def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True,
                     text_lemmatization=True, special_char_removal=True,
                     stopword_removal=True):
    """Run the full normalisation pipeline over a single text document."""
    doc = corpus
    # strip HTML
    if html_stripping:
        doc = strip_html_tags(doc)
    # remove accented characters
    if accented_char_removal:
        doc = remove_accented_chars(doc)
    # expand contractions
    if contraction_expansion:
        doc = expand_contractions(doc)
    # lowercase the text
    if text_lower_case:
        doc = doc.lower()
    # collapse newlines into spaces ([\r\n]+, not the original [\r|\n|\r\n]+,
    # whose literal '|' would also have stripped pipe characters)
    doc = re.sub(r'[\r\n]+', ' ', doc)
    # insert spaces around special characters to isolate them (the '-' is
    # placed last in the class so it is matched literally, not as a range)
    special_char_pattern = re.compile(r'([{}().!-])')
    doc = special_char_pattern.sub(" \\1 ", doc)
    # lemmatize text
    if text_lemmatization:
        doc = lemmatize_text(doc)
    # remove special characters
    if special_char_removal:
        doc = remove_special_characters(doc)
    # collapse runs of spaces
    doc = re.sub(' +', ' ', doc)
    # remove stopwords
    if stopword_removal:
        doc = remove_stopwords(doc, is_lower_case=text_lower_case)
    return doc
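
# Minimal usage sketch (the sample sentence is made up for illustration; exact
# output depends on the installed spaCy model and the contractions mapping):
if __name__ == "__main__":
    sample = "<p>The caf&eacute; wasn't great &mdash; we'll not go back!</p>"
    print(normalize_corpus(sample))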