Page MenuHomec4science

ex1c.py
No OneTemporary

File Metadata

Created
Mon, Feb 24, 02:51
import pandas
from datetime import datetime, timedelta
# Load both rating dumps. Column names are supplied explicitly for each file.
# NOTE: preprocess the .csv files before loading and eliminate all commas in
# movie titles!
_COLUMN_NAMES = ['email', 'movie', 'date', 'rating']
# The leading space is part of the format; presumably the raw date field
# starts with a space after the separator -- confirm against the CSV files.
_DATE_FORMAT = ' %d/%m/%y'

dedis = pandas.read_csv('dedis-3.csv', names=_COLUMN_NAMES, quotechar='"')
imdb = pandas.read_csv('imdb-3.csv', names=_COLUMN_NAMES, quotechar='"')

# Parse the date strings into timestamps so the functions below can match
# them against date ranges.
dedis['date'] = pandas.to_datetime(dedis['date'], format=_DATE_FORMAT)
imdb['date'] = pandas.to_datetime(imdb['date'], format=_DATE_FORMAT)
def get_possible_mails(rating, date):
    """Return the email hashes of dedis entries that could match one rating.

    A dedis entry is a candidate when its rating equals ``rating`` and its
    date is one of the days from 13 days before ``date`` to 13 days after it.

    Returns a one-column DataFrame ('email') holding the candidate hashes,
    one row per matching dedis entry.
    """
    window = timedelta(days=13)
    first_day = (date - window).strftime('%Y-%m-%d')
    last_day = (date + window).strftime('%Y-%m-%d')
    candidate_days = pandas.date_range(first_day, last_day)

    # Same rating AND inside the possible time range.
    is_candidate = (dedis['rating'] == rating) & dedis['date'].isin(candidate_days)
    return dedis.loc[is_candidate, 'email'].to_frame()
def get_mail_hash(my_movies):
    """Return the email hash that most often matches the given imdb ratings.

    Basically a frequency analysis: for every row of ``my_movies`` (needs the
    columns 'rating' and 'date') all candidate email hashes are collected via
    ``get_possible_mails``; the hash that shows up most often wins.

    Raises:
        KeyError: if no candidate email hash was found at all (including the
            case where ``my_movies`` has no rows).
    """
    # Collect every per-rating candidate frame first and concatenate once.
    # DataFrame.append was removed in pandas 2.0, and appending inside the
    # loop copied the accumulated frame on every iteration.
    frames = [
        get_possible_mails(row['rating'], row['date'])
        for _, row in my_movies.iterrows()
    ]
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        raise KeyError('no candidate email hash found for the given ratings')

    # Count the occurrences of each email hash.
    counts = pandas.concat(frames).groupby('email').size()
    if counts.empty:
        raise KeyError('no candidate email hash found for the given ratings')

    # Ascending stable sort, then take the last entry: the most frequent
    # hash, with ties resolved deterministically.
    return counts.sort_values(kind='mergesort').index[-1]
def get_possible_movies(date):
    """Return all imdb entries dated within 13 days of ``date``.

    The window covers the days from 13 days before ``date`` to 13 days after
    it. The result is a DataFrame with the matching imdb rows.
    """
    window = timedelta(days=13)
    candidate_days = pandas.date_range(
        (date - window).strftime('%Y-%m-%d'),
        (date + window).strftime('%Y-%m-%d'),
    )
    in_window = imdb['date'].isin(candidate_days)
    return imdb[in_window]
def find_imdb_movie(movie):
    """Map a dedis movie hash to the most likely imdb movie title.

    Basically a frequency analysis: for every dedis entry with the movie hash
    ``movie``, all imdb entries in the possible time range are collected via
    ``get_possible_movies``; the imdb title that occurs most often among
    them is returned.

    Raises:
        KeyError: if no imdb entry falls into any of the time ranges
            (including the case where ``movie`` does not occur in dedis).
    """
    # All entries for the given movie hash in dedis.
    all_views = dedis[dedis['movie'] == movie]

    # Gather the candidate frames and concatenate once. DataFrame.append was
    # removed in pandas 2.0, and appending inside the loop copied the
    # accumulated frame on every iteration.
    frames = [get_possible_movies(day) for day in all_views['date']]
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        raise KeyError('no candidate imdb movie found for hash %r' % (movie,))

    # Count how often each imdb title occurs as a possible match.
    counts = pandas.concat(frames).groupby('movie').size()
    if counts.empty:
        raise KeyError('no candidate imdb movie found for hash %r' % (movie,))

    # Ascending stable sort, then take the last entry: the most frequent
    # title, with ties resolved deterministically.
    return counts.sort_values(kind='mergesort').index[-1]
def map_movies(my_hash):
    """Resolve every movie hash rated by ``my_hash`` to an imdb title.

    Looks up all dedis entries whose email hash equals ``my_hash``, maps each
    entry's movie hash with ``find_imdb_movie``, prints the resulting list
    followed by a reminder about the preprocessing, and returns the list.
    """
    own_entries = dedis[dedis['email'] == my_hash]
    # One title per dedis entry, in the order of the entries.
    watched = [
        find_imdb_movie(entry['movie'])
        for _, entry in own_entries.iterrows()
    ]
    print(watched)
    print("Do not forget to change stuff from the preprocessing and change /' to ' ")
    return watched
def main():
    """Find the email hash belonging to the known imdb account and print its movies."""
    # Public imdb ratings of the known account are the starting point.
    own_ratings = imdb[imdb['email'] == 'dennis.gankin@epfl.ch']
    map_movies(get_mail_hash(own_ratings))


# Run the analysis only when executed as a script.
if __name__ == "__main__":
    main()

Event Timeline