-
Notifications
You must be signed in to change notification settings - Fork 7
/
LeetSpeak.py
338 lines (287 loc) · 14.2 KB
/
LeetSpeak.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
import random,copy,operator
from multiprocessing import Process,Pipe
from Dictionary import Dictionary
from Spelling import Spelling
#from GutenBooks import GutenBooks
#thread info: http://jessenoller.com/2009/02/01/python-threads-and-the-global-interpreter-lock/
#def synchronized():
# the_lock = Lock()
# def fwrap(function):
# def newFunction(*args, **kw):
# with the_lock:
# return function(*args, **kw)
# return newFunction
# return fwrap
class LeetSpeak:
def __init__(self,processes=1):
#number of threads
if processes > 0:
self.processes=processes
else:
self.processes=1
#load word frequency and spell checker
self.spelling=Spelling()
#load the dictionaries
self.jargon = Dictionary("slang")
self.dictionary = self.spelling.dictionary
self.stopwords = self.spelling.stopwords
self.a=["a","4","@","/-\\","/\\","/_\\","^","aye","ci","λ","∂","//-\\\\","/=\\","ae"]
self.b=["b","8","|3","6","13","l3","]3","|o","1o","lo","ß","]]3","|8","l8","18","]8"]
self.c=["c","(","<","[","{","sea","see","k","©","¢","€"]
self.d=["d","|]","l]","1]","|)","l)","1)","[)","|}","l]","1}","])","i>","|>","l>","1>","0","cl","o|","o1","ol","Ð","∂","ð"]
self.e=["e","3","&","[-","€","ii","ə","£","iii"]
self.f=["f","|=","]=","}","ph","(=","[=","ʃ","eph","ph"]
self.g=["g","6","9","&","(_+","C-","gee","jee","(Y,","cj","[","-","(γ,","(_-"]
self.h=["h","|-|","#","[-]","{-}","]-[",")-(","(-)",":-:","}{","}-{","aych","╫","]]-[[","aech"]
self.i=["!","1","|","l","eye","3y3","ai","i"]
self.j=["j","_|","_/","]","</","_)","_l","_1","¿","ʝ","ul","u1","u|","jay","(/","_]"]
self.k=["k","x","|<","|x","|{","/<","\\<","/x","\\x","ɮ","kay"]
self.l=["l","1","7","|_","1_","l_","lJ","£","¬","el"]
self.m=["m","/\/\\","|\\/|","em","|v|","[v]","^^","nn","//\\\\//\\\\","(V)","(\/)","/|\\","/|/|",".\\\\","/^^\\","/V\\","|^^|","JVL","][\\\\//][","[]\/[]","[]v[]","(t)"]
self.n=["n","|\\|","/\\/","//\\\\//","[\\]","<\\>","{\\}","//","[]\\[]","]\\[","~","₪","/|/","in"]
#the ω is because Ω is mistakenly taken as that character sometimes...
self.o=["o","0","()","oh","[]","{}","¤","Ω","ω","*","[[]]","oh"]
self.p=["p","|*","l*","1*","|o","lo","1o","|>","l>","1>","|\"","l\"","1\"","?","9","[]d","|7","l7","17","q","|d","ld","1d","℗","|º","1º","lº","þ","¶","pee"]
self.q=["q","0_","o_","0,","o,","(,)","[,]","<|","<l","<1","cue","9","¶","kew"]
self.r=["r","|2","l2","12","2","/2","I2","|^","l^","1^","|~","l~","1~","lz","[z","|`","l`","1`",".-","®","Я","ʁ","|?","l?","1?","arr"]
self.s=["s","5","$","z","es","2","§","š",",,\\``"]
self.t=["t","7","+","-|-","-l-","-1-","1","']['","†"]
self.u=["u","|_|","l_l","1_1","(_)","[_]","{_}","y3w","m","\\_/","\\_\\","/_/","µ","yew","yoo","yuu"]
self.v=["v","\\/","\\\\//","√"]
self.w=["w","\\/\\/","vv","'//","\\\\'","\\^/","(n)","\\x/","\\|/","\\_|_/","\\_l_/","\\_1_/","\\//\\//","\\_:_/","]i[","uu","Ш","ɰ","1/\\/","\\/1/","1/1/"]
self.x=["x","%","><","><,","}{","ecks","x","*",")(","ex","Ж","×"]
self.y=["y","j","`/","`(","-/","'/","\\-/","Ψ","φ","λ","Ч","¥","``//","\\j","wai"]
self.z=["z","2","~/_","%","7_","ʒ","≥","`/_"]
self.zero=["0","o","zero","cero","()"]
self.one=["1","won","one","l","|","]["]
self.two=["two","to","too","2","z"]
self.three=["e","3","three"]
self.four=["4","four","for","fore","a"]
self.five=["5","five","s"]
self.six=["6","six","g"]
self.seven=["7","seven","t","l"]
self.eight=["8","eight","b"]
self.nine=["9","nine","g"]
#"0":self.zero,"1":self.one,"2":self.two,"3":self.three,"4":self.four,"5":self.five,"6":self.six,"7":self.seven,"8":self.eight,"9":self.nine
self.alphabet={"a":self.a, "b":self.b, "c":self.c, "d":self.d, "e":self.e, "f":self.f, "g":self.g, "h":self.h, "i":self.i, "j":self.j, "k":self.k, "l":self.l, "m":self.m, "n":self.n, "o":self.o, "p":self.p, "q":self.q, "r":self.r, "s":self.s, "t":self.t, "u":self.u, "v":self.v, "w":self.w, "x":self.x, "y":self.y, "z":self.z}
def ConvertToLeet(self,text):
"""
This is fairly straightforward. Randomly select letters from the array of letters and output it.
"""
leet=""
for letter in list(text):
if letter.isalpha() and self.alphabet[letter.lower()]:
values=self.alphabet[letter.lower()]
random.seed()
number=random.randint(1,len(values))
leet+=values[number-1]
else:
leet+=letter
return leet
def rec_parse(self,text,previous=[]):
"""
Input:
Output:
"""
possibilities=[]
text_length=len(list(text))
if text_length > 7:
length=8
else:
length=text_length
for q in range(1,length):
if q < len(text):
possibilities.append(previous+[text[0:q],text[q:text_length]])
possibilities+=self.rec_parse(text[q:text_length],previous+[text[0:q]])
return possibilities
def rec_scan_array(self,array,previous=[]):
"""
Input: [['h'], ['e'], ['i', 'l', 't'], ['i', 'l', 't'], ['d', 'o']]
Output:
['h','e','i','i','d'],
['h','e','i','i','o'],
['h','e','i','1','d'],
['h','e','i','1','o'],
...
"""
words=[]
passon=copy.copy(array)
passon.pop(0)
if len(array) > 0:
for let in array[0]:
letters=copy.copy(previous)
letters.append(let)
if len(passon) > 0:
words+=self.rec_scan_array(passon,letters)
if len(array) == 1:
words.append("".join(letters))
del letters
del passon
return words
def ConvertFromLeet(self,text):
"""
Convert leet to readable English text. Find all possible words, check which are English, check for misspellings, etc.
Uses self.processes, so when creating the LeetSpeak() object, you can specify the number of threads to use: l=LeetSpeak(threads=3)
"""
#figure out how many words each thread should work on
split=text.split(" ")
thread_count={}
thread_words={}
thread_num=1
for word in split:
#add word to the array for the current thread
if thread_num in thread_count:
thread_count[thread_num]+=1
else:
thread_count[thread_num]=1
thread_words[thread_num]=[]
#up the thread_num unless it is currently at the number of threads we want, then set it to 1 to start over again
if self.processes > thread_num:
thread_num+=1
else:
thread_num=1
#compute what words each thread should decode
for num,word in enumerate(split):
for thread,words in thread_words.items():
if len(words) < thread_count[thread]:
thread_words[thread].append(word)
break
#INFORMATION:
#if self.processes = 3 and text = "cows are cool or not", thread_words={1: ['cows', 'are'], 2: ['cool', 'or'], 3: ['not']}
#create the processes
threads={}
num_threads=len(thread_words)
result_english=""
thread_results={}
receive_pipe,send_pipe=Pipe()
for i in range(self.processes):
if num_threads >= i+1:
threads[i]=Process(target=self.ConvertFromLeet_thread,args=(thread_words[i+1],i,send_pipe))
threads[i].start()
#start and wait for threads
for i in range(self.processes):
if num_threads >= i+1:
threads[i].join()
result=receive_pipe.recv()
thread_results[result[0]]=result[1]
#close the pipe
send_pipe.close()
#sort the results
thread_results=sorted(thread_results.items())
#make a string out of the results
for thread,string in thread_results:
result_english+=string+" "
return result_english.strip()
def ConvertFromLeet_thread(self,text,thread_id,pipe):
"""
The function that ConvertFromLeet() calls for each thread.
"""
english=[]
#convert each word
for word in text:
#get all the character locations less than 8 (e.g. "c,ow", "co,w", and "cow" for "cow")
#this uses some recursive substringing
possibilities=self.rec_parse(word.lower())
#append the actual "word" if it is less than 8 characters, since it might be a single letter (e.g. "n" for "and")
if len(word) <= 8:
possibilities.append([word.lower()])
#calculate what this could be in leet (if it can be anything)
validwords=[]
for possibility in possibilities:
letters=[]
valid=1
for char in possibility:
chars=[]
for let,val in self.alphabet.items():
if char in val:
chars.append(let)
if len(chars) == 0:
valid=0
break
else:
letters.append(chars)
del chars
if valid==1 and len(letters) > 0:
#generate possible words from given letters
words=self.rec_scan_array(letters)
validwords+=words
del words
#print(validwords)
#check which valid words are english if there's more than one option
#go with the most frequently used english word
if len(validwords) > 0:
englishwords={}
for valid in validwords:
score=1+5/len(valid)
#computer talk
if self.jargon.Contains(valid) == True:
value=2
jargon=self.jargon.Translate(valid)
if self.dictionary.Contains(jargon) == True:
value=4
score+=value
if len(jargon) > 0:
if jargon in englishwords:
englishwords[jargon]+=value
else:
englishwords[jargon]=score
score=0
#valid english
if len(valid) > 1 and self.dictionary.Contains(valid) == True:
score+=5
#frequency words
if self.stopwords.Contains(valid):
score+=self.spelling.Frequency(valid)
else:
score+=5*self.spelling.Frequency(valid)
#same length
if len(word) == len(valid):
score+=0.1
#no numbers
if valid.isalpha() == True:
score+=1
englishwords[valid]=score
#figure out what word is the most likely to be correctable
check=[]
skip=0
for valid in englishwords:
if valid.isalpha():
#if there is already a good word in the list, then don't bother with looking up spell corrections
if self.dictionary.Contains(valid) and len(valid) >= len(word)/2:
skip=1
check=[]
break
else:
check.append(valid)
if len(check)==0 and skip == 0:
check.append(englishwords[0])
#append the corrected version, hopefully
for item in check:
corrected=self.spelling.Check(item,dictionary=True,fast=True)
if corrected != False and len(corrected) > 0:
word=corrected[0]
if word not in englishwords:
frequency=self.spelling.Frequency(word)
#if it is on the stop list, don't add as much weight
if self.stopwords.Contains(word):
value=frequency+1
else:
value=5*frequency+1
#add weight if in the dictionary
if self.dictionary.Contains(word) == True:
value+=1
#add weight if not numbers
if word.isalpha() == True:
value+=1
englishwords[word]=value
else:
#if one of the corrected words list is in the englishwords list then up that value by 0.1
for correct in corrected:
if correct in englishwords:
englishwords[correct]+=0.1
#get the most likely word
final=sorted(englishwords.items(),key=operator.itemgetter(1),reverse=True)[0]
#add word
english.append(final[0])
#send the result
pipe.send([thread_id," ".join(english)])