Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F102783579
ex1b.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Feb 24, 03:48
Size
3 KB
Mime Type
text/x-c
Expires
Wed, Feb 26, 03:48 (2 d)
Engine
blob
Format
Raw Data
Handle
24423005
Attached To
R6092 InfoSec
ex1b.py
View Options
import
pandas
import
numpy
as
np
# Load both data sets.
# NOTE: pre-process the .csv files before loading and remove every comma
# that appears inside a movie title!
_RATING_COLUMNS = ('email', 'movie', 'date', 'rating')

dedis = pandas.read_csv(
    'dedis-2.csv',
    names=list(_RATING_COLUMNS),
    quotechar='"',
)
imdb = pandas.read_csv(
    'imdb-2.csv',
    names=list(_RATING_COLUMNS),
    quotechar='"',
)
def _movie_counts(ratings, count_column):
    """Count the rows per ``movie`` value and order the result by that count.

    The returned frame is sorted ascending by *count_column* and carries a
    fresh 0..n-1 index, plus an ``index`` column holding each row's position
    before sorting.
    """
    counts = ratings.groupby('movie').size().reset_index(name=count_column)
    # reset_index() without drop=True is deliberate: it keeps the pre-sort
    # position as an ``index`` column, preserving the frame layout.
    return counts.sort_values(count_column).reset_index()


def frequency_analysis():
    """Pair movie names with movie hashes by their rank in rating frequency.

    Counts how often each movie occurs in ``imdb`` and in ``dedis``, sorts
    both tables by that count and joins them row by row on the resulting
    positional index.  Row *i* therefore pairs the i-th least frequent imdb
    movie (``movie_imdb``) with the i-th least frequent dedis movie
    (``movie_dedis``).

    NOTE(review): movies with equal counts are paired in whatever order the
    sort leaves them, so the mapping is only a best guess for ties.

    Returns:
        The joined frame; overlapping column names carry the suffixes
        ``_imdb`` and ``_dedis``.
    """
    by_name = _movie_counts(imdb, 'size')
    by_hash = _movie_counts(dedis, 'views')
    return by_name.join(other=by_hash, lsuffix='_imdb', rsuffix='_dedis')
def hashed_user_movie_list(user_hash, freq_table):
    """Return the imdb movie names mapped to the movies of one dedis user.

    Args:
        user_hash: value of the ``email`` column in ``dedis`` identifying
            the user.  It is compared for exact equality.
        freq_table: mapping table with the columns ``movie_dedis`` and
            ``movie_imdb`` (as produced by ``frequency_analysis``).

    Returns:
        A list with the ``movie_imdb`` entry of every ``freq_table`` row
        whose ``movie_dedis`` value was watched by the user.
    """
    # Exact comparison: a substring/regex match (str.contains) could pick up
    # rows of other users or misinterpret regex metacharacters in the hash.
    watched_hashes = dedis.loc[dedis['email'] == user_hash, 'movie'].tolist()

    # Translate the watched hashes into names via the frequency table.
    is_watched = freq_table['movie_dedis'].isin(watched_hashes)
    return freq_table.loc[is_watched, 'movie_imdb'].tolist()
def num_different_movies(my_movies, user_movies):
    """Count the distinct movies in *my_movies* missing from *user_movies*.

    The comparison is one-sided: movies that occur only in *user_movies* do
    not contribute to the result.

    Args:
        my_movies: iterable of movie names to look for.
        user_movies: iterable of movie names to compare against.

    Returns:
        The number of distinct entries of *my_movies* that do not occur in
        *user_movies*.
    """
    # A set difference keeps duplicates in my_movies from inflating the
    # count (len(list) - len(intersection) counted every repeat as a miss).
    return len(set(my_movies) - set(user_movies))
def movie_difference(email_hash, movie_list, freq_table):
    """Return how many movies of *movie_list* the given user has not watched.

    The user's watched movies are resolved to names with
    ``hashed_user_movie_list`` and compared against *movie_list* with
    ``num_different_movies``.
    """
    watched_by_user = hashed_user_movie_list(email_hash, freq_table)
    return num_different_movies(movie_list, watched_by_user)
def find_my_email_hash(my_movies, freq_table):
    """Find the dedis e-mail hash whose movies cover all of *my_movies*.

    Every distinct ``email`` value in ``dedis`` is tried in the order the
    group-by yields them; the first one for which ``movie_difference``
    reports 0 is returned.

    Returns:
        The matching ``email`` value, or ``None`` when no user matches.
    """
    # One entry per distinct user hash.
    candidate_hashes = dedis.groupby('email').size().index
    for candidate in candidate_hashes:
        if movie_difference(candidate, my_movies, freq_table) == 0:
            return candidate
    return None
def find_my_movies(movie_hash, freq_table):
    """Print the movie name that *freq_table* pairs with *movie_hash*.

    Looks up the first row whose ``movie_dedis`` equals *movie_hash* and
    prints its ``movie_imdb`` value.  Nothing is returned.

    Raises:
        IndexError: if no row of *freq_table* carries *movie_hash*.
    """
    matching_rows = freq_table[freq_table['movie_dedis'] == movie_hash]
    first_match = matching_rows.iloc[0]
    print(first_match['movie_imdb'])
def main(email='dennis.gankin@epfl.ch'):
    """Print the names of all movies the given imdb user watched in dedis.

    Steps:
      1. collect the movies listed for *email* in ``imdb``;
      2. build the hash-to-name mapping with ``frequency_analysis``;
      3. find the dedis e-mail hash whose movies cover that list;
      4. print the mapped name of every movie recorded for that hash.

    Args:
        email: e-mail address to look up in ``imdb``.
    """
    # My movies in imdb.
    movie_list = list(imdb.loc[imdb['email'] == email, 'movie'])
    freq_table = frequency_analysis()

    # Find my e-mail hash.
    my_mail_hash = find_my_email_hash(movie_list, freq_table)
    if my_mail_hash is None:
        # Without this guard the lookup below silently matches no rows and
        # the script prints an empty result with no explanation.
        print("No matching user hash found for {}".format(email))
        return

    # All the movie hashes recorded for my hash.
    my_movie_hashes = dedis.loc[dedis['email'] == my_mail_hash, 'movie']

    # For each of my hashes print the corresponding movie name.
    print("Watched movies:")
    for movie_hash in my_movie_hashes:
        find_my_movies(movie_hash, freq_table)


if __name__ == "__main__":
    main()
Event Timeline
Log In to Comment