Page MenuHomec4science

ex1b.py
No OneTemporary

File Metadata

Created
Mon, Feb 24, 03:48
import pandas
import numpy as np
# Load the two rating dumps. Column names are passed explicitly via `names`,
# so the first line of each file is read as data rather than as a header.
# NOTE: preprocess the .csv files before loading so that no movie title
# contains a comma.
dedis = pandas.read_csv(
    'dedis-2.csv',
    quotechar='"',
    names=['email', 'movie', 'date', 'rating'],
)
imdb = pandas.read_csv(
    'imdb-2.csv',
    quotechar='"',
    names=['email', 'movie', 'date', 'rating'],
)
def frequency_analysis():
    """Pair the movies of both datasets by their rank in rating frequency.

    Counts the rows per movie in the module-level ``imdb`` and ``dedis``
    frames, sorts each count table in ascending order and joins the two
    tables row by row on their positional index, so the n-th least frequent
    IMDb title ends up next to the n-th least frequent dedis movie hash.

    Returns:
        A DataFrame with the columns ``index_imdb``, ``movie_imdb``,
        ``size``, ``index_dedis``, ``movie_dedis`` and ``views``.
    """
    # Rows per title in the IMDb data, least frequent first. The trailing
    # reset_index() keeps the pre-sort position as an 'index' column and
    # gives the frame the 0..n-1 index that the join below aligns on.
    name_counts = (
        imdb.groupby('movie').size()
        .reset_index(name='size')
        .sort_values('size')
        .reset_index()
    )

    # Same procedure for the hashed movie identifiers in the dedis data.
    hash_counts = (
        dedis.groupby('movie').size()
        .reset_index(name='views')
        .sort_values('views')
        .reset_index()
    )

    # Columns present in both tables ('index', 'movie') receive the suffixes.
    return name_counts.join(other=hash_counts, lsuffix='_imdb', rsuffix='_dedis')
def hashed_user_movie_list(user_hash, freq_table):
    """Return the plain movie titles attributed to one hashed user.

    Args:
        user_hash: Value of the ``email`` column of the module-level
            ``dedis`` frame that identifies the user.
        freq_table: Mapping frame with ``movie_dedis`` and ``movie_imdb``
            columns, as produced by ``frequency_analysis()``.

    Returns:
        A list with the ``movie_imdb`` value of every ``freq_table`` row
        whose ``movie_dedis`` value is among the user's rated movie hashes.
    """
    # Compare for exact equality. str.contains() would interpret the hash as
    # a regular expression and would also accept any email that merely
    # contains the hash as a substring.
    hashed_movies = dedis.loc[dedis['email'] == user_hash, 'movie']

    # Test membership on the movie_dedis column only; the other columns of
    # the frequency table are irrelevant for the lookup.
    is_watched = freq_table['movie_dedis'].isin(hashed_movies.tolist())

    return freq_table.loc[is_watched, 'movie_imdb'].tolist()
def num_different_movies(my_movies, user_movies):
    """Count the distinct movies in ``my_movies`` missing from ``user_movies``.

    Args:
        my_movies: Iterable of hashable movie identifiers (reference list).
        user_movies: Iterable of hashable movie identifiers to compare with.

    Returns:
        The number of distinct entries of ``my_movies`` that do not occur in
        ``user_movies``; 0 means every one of my movies was also watched by
        the other user.
    """
    # A set difference counts each title once. Subtracting the size of the
    # intersection from len(my_movies) would count a repeated title in
    # my_movies as "different" even when the other user has watched it.
    return len(set(my_movies).difference(user_movies))
def movie_difference(email_hash, movie_list, freq_table):
    """Return how many movies of ``movie_list`` a hashed user has not watched.

    Resolves the movies of ``email_hash`` through ``freq_table`` with
    ``hashed_user_movie_list()`` and compares them against ``movie_list``
    with ``num_different_movies()``.
    """
    candidate_movies = hashed_user_movie_list(email_hash, freq_table)
    return num_different_movies(movie_list, candidate_movies)
def find_my_email_hash(my_movies, freq_table):
    """Find the first hashed email whose movies cover all of ``my_movies``.

    Args:
        my_movies: Plain movie titles known to belong to the wanted user.
        freq_table: Hash-to-title mapping from ``frequency_analysis()``.

    Returns:
        The first ``email`` value of the module-level ``dedis`` frame (in
        sorted order) for which ``movie_difference()`` is 0, or ``None``
        when no user matches.
    """
    # Grouping yields every distinct email exactly once, in sorted order.
    candidate_hashes = dedis.groupby('email').size().index

    # Lazy scan: stops at the first candidate with no differing movie.
    return next(
        (
            candidate
            for candidate in candidate_hashes
            if movie_difference(candidate, my_movies, freq_table) == 0
        ),
        None,
    )
def find_my_movies(movie_hash, freq_table):
    """Print the title that the frequency table pairs with ``movie_hash``.

    Args:
        movie_hash: Value to look up in the ``movie_dedis`` column.
        freq_table: Frame with ``movie_dedis`` and ``movie_imdb`` columns.

    Raises:
        IndexError: If no row of ``freq_table`` carries ``movie_hash``.
    """
    titles = freq_table.loc[freq_table['movie_dedis'] == movie_hash, 'movie_imdb']
    # Only the first matching row is reported.
    print(titles.iloc[0])
def main():
    """Recover and print the movies recorded for my hashed identity.

    Takes the titles rated under my address in ``imdb``, builds the
    frequency-based hash-to-title table, searches ``dedis`` for the hashed
    email whose movies cover those titles and prints the title mapped to
    each movie hash of that user.
    """
    # Titles rated under my address in the IMDb data.
    my_imdb_rows = imdb[imdb['email'] == 'dennis.gankin@epfl.ch']
    movie_list = list(my_imdb_rows['movie'])

    freq_table = frequency_analysis()

    # Hashed email that corresponds to my address.
    my_mail_hash = find_my_email_hash(movie_list, freq_table)

    # Every dedis row stored under that hash.
    my_dedis_rows = dedis[dedis['email'] == my_mail_hash]

    # Translate each of my movie hashes back to a title and print it.
    print("Watched movies:")
    for movie_hash in my_dedis_rows['movie']:
        find_my_movies(movie_hash, freq_table)


if __name__ == "__main__":
    main()

Event Timeline