File Metadata

Created: Mon, Feb 24, 02:51

ex1c.py
View Options

	import pandas
	from datetime import datetime, timedelta

	# Load both rating dumps.
	# NOTE: preprocess the .csv files before loading and eliminate all commas in
	# movie titles!
	# Neither file is read with a header row; both get the same four column names.
	_COLUMNS = ['email', 'movie', 'date', 'rating']
	# The format string starts with a space -- presumably the CSV has a space
	# after each comma (TODO confirm against the data files).
	_DATE_FORMAT = ' %d/%m/%y'

	dedis = pandas.read_csv('dedis-3.csv', names=_COLUMNS, quotechar='"')
	imdb = pandas.read_csv('imdb-3.csv', names=_COLUMNS, quotechar='"')

	# Parse the date strings into datetimes so the functions below can do date
	# arithmetic and comparisons on them.
	dedis['date'] = pandas.to_datetime(dedis['date'], format=_DATE_FORMAT)
	imdb['date'] = pandas.to_datetime(imdb['date'], format=_DATE_FORMAT)


	def get_possible_mails(rating, date, window_days=13):
	    """Return the DeDiS email hashes that could belong to one IMDb rating.

	    A row of ``dedis`` is a candidate when its rating equals ``rating`` and
	    its calendar day lies within ``window_days`` days of the calendar day of
	    ``date`` (both ends inclusive).

	    Args:
	        rating: Value compared for equality against ``dedis['rating']``.
	        date: Timestamp (or datetime) of the rating being matched.
	        window_days: Half-width of the date window in days. Defaults to 13,
	            the value this script has always used.

	    Returns:
	        A one-column DataFrame (``email``) with one row per matching DeDiS
	        rating; the same hash can appear more than once.
	    """
	    centre = pandas.Timestamp(date).normalize()
	    margin = timedelta(days=window_days)
	    first_day = centre - margin
	    # Exclusive upper bound: midnight after the last day of the window, so
	    # the whole final day is covered whatever the time of day. The previous
	    # date_range/isin test only ever matched exact-midnight timestamps.
	    day_after_last = centre + margin + timedelta(days=1)

	    rated_on = dedis['date']
	    in_window = (rated_on >= first_day) & (rated_on < day_after_last)
	    matches = dedis[(dedis['rating'] == rating) & in_window]
	    return matches['email'].to_frame()

	def get_mail_hash(my_movies):
	    """Return the DeDiS email hash that best matches a set of IMDb ratings.

	    This is a frequency analysis: for every row of ``my_movies`` the
	    candidate hashes from ``get_possible_mails`` are collected, and the hash
	    collected most often is returned.

	    Args:
	        my_movies: DataFrame with at least the columns ``rating`` and
	            ``date`` (one row per IMDb rating of the user).

	    Returns:
	        The most frequent candidate value of ``email``. When several hashes
	        share the highest count, the one that sorts last is returned.

	    Raises:
	        ValueError: If no rating in ``my_movies`` has any candidate, so there
	            is nothing to choose from.
	    """
	    per_rating = (
	        get_possible_mails(rating, date)
	        for rating, date in zip(my_movies['rating'], my_movies['date'])
	    )
	    # Drop empty results up front: concat() rejects an empty list, and empty
	    # frames contribute nothing to the counts anyway.
	    candidates = [frame for frame in per_rating if not frame.empty]
	    if not candidates:
	        raise ValueError('no DeDiS rating matches any of the given IMDb ratings')

	    # Concatenate once instead of growing a frame inside the loop:
	    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
	    # frame on every call.
	    counts = pandas.concat(candidates, ignore_index=True).groupby('email').size()

	    # Ascending stable sort keeps tied hashes in key order, so the last
	    # entry is a deterministic "most frequent" pick.
	    return counts.sort_values(kind='stable').index[-1]

	def get_possible_movies(date, window_days=13):
	    """Return every IMDb rating made within a date window around ``date``.

	    Args:
	        date: Timestamp (or datetime) at the centre of the window.
	        window_days: Half-width of the window in days. Defaults to 13, the
	            value this script has always used.

	    Returns:
	        The rows of ``imdb`` (all columns) whose calendar day lies within
	        ``window_days`` days of the calendar day of ``date``, both ends
	        inclusive.
	    """
	    centre = pandas.Timestamp(date).normalize()
	    margin = timedelta(days=window_days)
	    first_day = centre - margin
	    # Exclusive upper bound: midnight after the last day of the window, so
	    # the whole final day is covered whatever the time of day. The previous
	    # date_range/isin test only ever matched exact-midnight timestamps.
	    day_after_last = centre + margin + timedelta(days=1)

	    rated_on = imdb['date']
	    return imdb[(rated_on >= first_day) & (rated_on < day_after_last)]


	def find_imdb_movie(movie):
	    """Map a DeDiS movie hash to the most likely IMDb movie title.

	    This is a frequency analysis: for every DeDiS rating of ``movie``, all
	    IMDb ratings returned by ``get_possible_movies`` for that rating's date
	    are collected, and the title collected most often is returned.

	    Args:
	        movie: Movie hash as it appears in ``dedis['movie']``.

	    Returns:
	        The most frequent candidate value of ``imdb['movie']``. When several
	        titles share the highest count, the one that sorts last is returned.

	    Raises:
	        ValueError: If ``movie`` has no DeDiS ratings, or none of them has an
	            IMDb rating inside its date window.
	    """
	    view_dates = dedis.loc[dedis['movie'] == movie, 'date']

	    per_view = (get_possible_movies(day) for day in view_dates)
	    # Drop empty results up front: concat() rejects an empty list, and empty
	    # frames contribute nothing to the counts anyway.
	    candidates = [frame for frame in per_view if not frame.empty]
	    if not candidates:
	        raise ValueError(
	            'no IMDb rating falls in the date window of movie hash {!r}'.format(movie)
	        )

	    # Concatenate once instead of growing a frame inside the loop:
	    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
	    # frame on every call.
	    counts = pandas.concat(candidates, ignore_index=True).groupby('movie').size()

	    # Ascending stable sort keeps tied titles in key order, so the last
	    # entry is a deterministic "most frequent" pick.
	    return counts.sort_values(kind='stable').index[-1]

	def map_movies(my_hash):
	    """Print and return the IMDb titles matched to the movies of one hash.

	    Args:
	        my_hash: Email hash as it appears in ``dedis['email']``.

	    Returns:
	        A list with one ``find_imdb_movie`` result per DeDiS rating of
	        ``my_hash``, in the order those rows appear in ``dedis``.
	    """
	    rated_hashes = dedis.loc[dedis['email'] == my_hash, 'movie']
	    # Resolve every movie hash of this user to an IMDb title.
	    titles = [find_imdb_movie(movie_hash) for movie_hash in rated_hashes]

	    print(titles)
	    print ("Do not forget to change stuff from the preprocessing and change /' to ' ")
	    return titles


	def main(email='dennis.gankin@epfl.ch'):
	    """Find the DeDiS hash of one IMDb user and print their matched movies.

	    The IMDb ratings of ``email`` are matched against ``dedis`` to pick the
	    most likely email hash; ``map_movies`` then prints the IMDb titles
	    matched to that hash's movies.

	    Args:
	        email: Address as it appears in ``imdb['email']``. Defaults to the
	            address this script was originally written for.
	    """
	    my_movies = imdb[imdb['email'] == email]
	    my_hash = get_mail_hash(my_movies)
	    map_movies(my_hash)


	# Run the analysis only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()

ex1c.py
No OneTemporary
Actions

File Metadata

ex1c.py
View Options

Event Timeline

ex1c.pyNo OneTemporaryActions

File Metadata

ex1c.pyView Options

Event Timeline

ex1c.py
No OneTemporary
Actions

ex1c.py
View Options