-
Notifications
You must be signed in to change notification settings - Fork 0
/
content.py
111 lines (86 loc) · 4.67 KB
/
content.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import numpy as np
import pandas as pd
from ast import literal_eval
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from helper import *
#from unidecode import unidecode
def content_desc(title, md, ls, filters=None):
    """Recommend movies whose description text is closest to that of ``title``.

    The description of a movie is its ``overview`` followed by its ``tagline``.
    Descriptions are turned into TF-IDF vectors (English stop words removed,
    unigrams and bigrams) and candidates are ranked by their dot product with
    the query movie's vector.

    Args:
        title: Exact value of the ``title`` column identifying the query movie.
            When several rows share the title, the first one is the query.
        md: Movie metadata DataFrame; the columns ``id``, ``title``,
            ``overview`` and ``tagline`` are read here.
        ls: Collection of movie ids; only rows of ``md`` whose ``id`` is in it
            are considered.
        filters: Passed unchanged to ``filter_data`` (from ``helper``), which
            defines its meaning. ``None`` is treated as an empty dict.

    Returns:
        A DataFrame with at most 500 rows ordered from most to least similar,
        never containing the query row itself. Besides the input columns it
        carries ``index`` (the pre-reset row label) and ``desc``. Returns
        ``None`` (after printing an error) when ``title`` is not among the
        rows selected by ``ls``.
    """
    # A literal {} default would be one dict shared by every call.
    filters = {} if filters is None else filters

    smd1 = md[md['id'].isin(ls)]
    query_rows = smd1[smd1['title'] == title]
    if query_rows.empty:
        print("Error: I haven't seen this movie title in my collection before. Can't proceed.")
        return None

    smd = filter_data(smd1, filters)
    if not len(smd[smd['title'] == title]):
        print("Warning: Given title not in required filters! I'm proceeding with the query but the results might not be correct.")
        smd = pd.concat([smd, query_rows])

    # reset_index() hands back a new, 0..n-1 indexed frame, so row positions
    # match matrix rows and the column assignment below does not write into
    # a slice of the caller's data.
    smd = smd.reset_index()

    # Join with a space: gluing the two strings together would fuse the last
    # word of the overview with the first word of the tagline.
    smd['desc'] = smd['overview'].fillna('') + ' ' + smd['tagline'].fillna('')

    tfidf = TfidfVectorizer(analyzer='word', stop_words="english", ngram_range=(1, 2))
    matrix = tfidf.fit_transform(smd['desc'])

    # Titles are not unique; use the first matching row as the query instead
    # of a title-indexed lookup that yields several positions for duplicates.
    idx = int(np.flatnonzero((smd['title'] == title).values)[0])

    # Only the query's row of the similarity matrix is needed.
    similarity = linear_kernel(matrix[idx], matrix).ravel()

    # Stable descending order keeps ties in row order. The query row is
    # removed explicitly because it is not guaranteed to rank first (e.g.
    # when its description is empty and every similarity is 0).
    ranked = np.argsort(-similarity, kind='stable')
    ranked = ranked[ranked != idx][:500]
    return smd.iloc[ranked]
def content_metadata(title, md, ls, cr, kw, filters=None):
    """Recommend movies sharing cast, director, keywords (and genres) with ``title``.

    Every movie is reduced to a bag of tokens ("soup") made of its stemmed
    keywords, its first three cast names, its director (repeated three times
    so it weighs more) and, unless ``filters`` already constrains ``genres``,
    its genres. Soups are count-vectorised (unigrams and bigrams) and the 500
    movies most cosine-similar to the query are kept. Those candidates are
    then cut down to the ones whose ``vote_count`` reaches the candidates'
    median and ordered by ``weighted_rating`` (from ``helper``).

    Args:
        title: Exact value of the ``title`` column identifying the query movie.
            When several rows share the title, the first one is the query.
        md: Movie metadata DataFrame; ``id``, ``title``, ``genres``,
            ``vote_count`` and ``vote_average`` are read here. ``genres`` is
            concatenated as-is with the other token lists, so it is expected
            to already hold lists of strings.
        ls: Collection of movie ids; only rows whose ``id`` is in it are
            considered.
        cr: Credits DataFrame with ``id``, ``cast`` and ``crew``; the latter
            two hold Python-literal strings parsed with ``literal_eval``.
        kw: Keywords DataFrame with ``id`` and ``keywords`` (Python-literal
            strings parsed with ``literal_eval``).
        filters: Passed unchanged to ``filter_data`` (from ``helper``), which
            defines its meaning. ``None`` is treated as an empty dict.

    Returns:
        A DataFrame of the retained candidates sorted by ``rating``
        descending, with the added columns ``index``, ``cast_size``,
        ``crew_size``, ``director``, ``soup`` and ``rating``. Returns ``None``
        (after printing an error) when ``title`` is not among the rows
        selected by ``ls``.
    """
    # A literal {} default would be one dict shared by every call.
    filters = {} if filters is None else filters

    md = md.merge(cr, on='id').merge(kw, on='id')
    smd1 = md[md['id'].isin(ls)]
    query_rows = smd1[smd1['title'] == title]
    if query_rows.empty:
        print("Error: I haven't seen this movie title in my collection before. Can't proceed.")
        return None

    smd = filter_data(smd1, filters)
    if not len(smd[smd['title'] == title]):
        print("Warning: Given title not in required filters! I'm proceeding with the query but the results might not be correct.")
        smd = pd.concat([smd, query_rows])

    # reset_index() hands back a new, 0..n-1 indexed frame, so row positions
    # match matrix rows and the column assignments below do not write into
    # a slice of the merged data.
    smd = smd.reset_index()

    def as_tokens(names):
        # "Tom Hanks" -> "tomhanks": one token per name for the vectoriser.
        return [name.replace(" ", "").lower() for name in names]

    def names_of(entries):
        return [entry['name'] for entry in entries] if isinstance(entries, list) else []

    # Only the first three listed cast members are kept.
    smd['cast'] = smd['cast'].apply(literal_eval).apply(lambda people: as_tokens(names_of(people)[:3]))
    # NOTE(review): measured after the truncation above, so this is at most 3
    # - confirm that is intended (crew_size below counts the full crew).
    smd['cast_size'] = smd['cast'].apply(len)

    smd['crew'] = smd['crew'].apply(literal_eval)
    smd['crew_size'] = smd['crew'].apply(len)
    # NOTE(review): astype('str') turns whatever get_director returns for a
    # missing director into a literal token shared by all such movies.
    smd['director'] = (
        smd['crew']
        .apply(get_director)
        .astype('str')
        .apply(lambda name: [name.replace(" ", "").lower()] * 3)
    )

    smd['keywords'] = smd['keywords'].apply(literal_eval).apply(names_of)
    # Keyword frequencies over the whole candidate pool, flattened in one
    # pass instead of building a padded per-row frame and stacking it.
    flat_keywords = [word for words in smd['keywords'] for word in words]
    s = pd.Series(flat_keywords, dtype=object, name='keyword').value_counts()
    s = s[s > 1]  # keywords seen only once are not handed to filter_keywords

    stemmer = SnowballStemmer('english')
    smd['keywords'] = smd['keywords'].apply(
        lambda words: as_tokens(stemmer.stem(word) for word in filter_keywords(words, s))
    )

    soup_tokens = smd['keywords'] + smd['cast'] + smd['director']
    if "genres" not in filters:
        # Genres join the soup only when the filters do not mention them.
        soup_tokens = soup_tokens + smd['genres']
    smd['soup'] = soup_tokens.apply(' '.join)

    count = CountVectorizer(analyzer='word', stop_words="english", ngram_range=(1, 2))
    matrix = count.fit_transform(smd['soup'])

    # Titles are not unique; use the first matching row as the query instead
    # of a title-indexed lookup that yields several positions for duplicates.
    idx = int(np.flatnonzero((smd['title'] == title).values)[0])

    # Only the query's row of the similarity matrix is needed.
    similarity = cosine_similarity(matrix[idx], matrix).ravel()

    # Stable descending order keeps ties in row order. The query row is
    # removed explicitly because it is not guaranteed to rank first.
    ranked = np.argsort(-similarity, kind='stable')
    ranked = ranked[ranked != idx][:500]
    movies = smd.iloc[ranked]

    has_count = movies['vote_count'].notnull()
    has_average = movies['vote_average'].notnull()
    vote_counts = movies.loc[has_count, 'vote_count'].astype('int').quantile(0.5)
    vote_averages = movies.loc[has_average, 'vote_average'].astype('float').mean()

    # .copy() so the dtype conversions and the new column land on an
    # independent frame rather than on a filtered view.
    movies = movies[has_count & has_average & (movies['vote_count'] >= vote_counts)].copy()
    movies['vote_count'] = movies['vote_count'].astype('int')
    movies['vote_average'] = movies['vote_average'].astype('float')

    if movies.empty:
        # Avoid depending on what DataFrame.apply(axis=1) yields for a frame
        # without rows; an empty float column keeps the output schema.
        movies['rating'] = pd.Series(dtype='float64', index=movies.index)
    else:
        movies['rating'] = movies.apply(
            lambda row: weighted_rating(row, vote_counts, vote_averages), axis=1
        )
    return movies.sort_values('rating', ascending=False)
if __name__ == "__main__":  # See analysis.ipynb for more
    from helper import *

    # Manual smoke run: load and preprocess each dataset through the helper
    # functions, then print both kinds of recommendations for one fixed title.
    query_title = 'The Hangover Part III'

    md = preprocess_md(read_metadata())
    ls = preprocess_ls(read_links_small())
    cr = preprocess_cr(read_credits())
    kw = preprocess_kw(read_keywords())

    description_based = content_desc(query_title, md, ls)
    print(description_based)

    metadata_based = content_metadata(query_title, md, ls, cr, kw)
    print(metadata_based)