Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F102779475
ex1c.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Feb 24, 02:51
Size
3 KB
Mime Type
text/x-python
Expires
Wed, Feb 26, 02:51 (2 d)
Engine
blob
Format
Raw Data
Handle
24422822
Attached To
R6092 InfoSec
ex1c.py
View Options
import
pandas
from
datetime
import
datetime
,
timedelta
# Load the two rating dumps. Neither file has a header row, so the column
# names are supplied explicitly.
# NOTE: preprocess the .csv files before loading and remove every comma from
# the movie titles!
_RATING_COLUMNS = ['email', 'movie', 'date', 'rating']
_DATE_FORMAT = '%d/%m/%y'  # day/month/two-digit year

dedis = pandas.read_csv('dedis-3.csv', names=_RATING_COLUMNS, quotechar='"')
imdb = pandas.read_csv('imdb-3.csv', names=_RATING_COLUMNS, quotechar='"')

# Turn the textual dates into datetimes so the functions below can do date
# arithmetic and membership tests on them.
dedis['date'] = pandas.to_datetime(dedis['date'], format=_DATE_FORMAT)
imdb['date'] = pandas.to_datetime(imdb['date'], format=_DATE_FORMAT)
def get_possible_mails(rating, date):
    """Collect the dedis email values that could belong to one rating.

    A dedis row is a candidate when its rating equals ``rating`` and its
    date is one of the calendar days from 13 days before to 13 days after
    ``date`` (both ends included).

    Args:
        rating: Rating value to match against ``dedis['rating']``.
        date: Datetime-like value supporting ``timedelta`` arithmetic and
            ``strftime``.

    Returns:
        A one-column DataFrame (``'email'``) with one row per matching
        dedis entry; duplicates are kept.
    """
    slack = timedelta(days=13)
    # Formatting as a date string drops any time-of-day, so the range below
    # consists of one midnight timestamp per day.
    first_day = (date - slack).strftime('%Y-%m-%d')
    last_day = (date + slack).strftime('%Y-%m-%d')
    days_in_window = pandas.date_range(first_day, last_day)

    same_rating = dedis[dedis['rating'] == rating]
    inside_window = same_rating['date'].isin(days_in_window)
    return same_rating[inside_window]['email'].to_frame()
def get_mail_hash(my_movies):
    """Return the dedis email value that matches the given ratings most often.

    This is a frequency analysis: for every row of ``my_movies`` all dedis
    email values with the same rating in the surrounding date window are
    collected (see ``get_possible_mails``), and the value collected most
    often is returned.

    Args:
        my_movies: DataFrame with at least a ``'rating'`` and a ``'date'``
            column, one row per rated movie.

    Returns:
        The most frequent candidate email value. Ties are resolved by the
        sort order of the counts (the last entry after an ascending sort).

    Raises:
        KeyError: If there is not a single candidate (``my_movies`` is empty
            or no dedis row matches any rating).
    """
    # Gather all per-movie candidate frames first and concatenate once.
    # DataFrame.append was removed in pandas 2.0, and calling it in a loop
    # copied the accumulated frame on every iteration.
    per_movie = [
        get_possible_mails(row['rating'], row['date'])
        for _, row in my_movies.iterrows()
    ]
    if not per_movie:
        raise KeyError('no ratings supplied; cannot determine an email hash')

    # Number of times each email value showed up as a candidate.
    occurrences = pandas.concat(per_movie).groupby('email').size()
    if occurrences.empty:
        raise KeyError('no dedis entry matches the supplied ratings')

    # Ascending sort, so the most frequent value sits at the end.
    return occurrences.sort_values().index[-1]
def get_possible_movies(date):
    """Select the imdb rows dated within 13 days of ``date``.

    Args:
        date: Datetime-like value supporting ``timedelta`` arithmetic and
            ``strftime``.

    Returns:
        DataFrame holding every imdb row whose ``'date'`` is one of the
        calendar days from 13 days before to 13 days after ``date``
        (both ends included), with all imdb columns kept.
    """
    slack = timedelta(days=13)
    # The string round-trip strips any time-of-day, giving one midnight
    # timestamp per day in the range.
    window_start = (date - slack).strftime('%Y-%m-%d')
    window_end = (date + slack).strftime('%Y-%m-%d')
    days_in_window = pandas.date_range(window_start, window_end)

    return imdb[imdb['date'].isin(days_in_window)]
def find_imdb_movie(movie):
    """Map a dedis movie value to the most likely imdb movie title.

    Every dedis entry for ``movie`` contributes all imdb rows dated within
    the surrounding window (see ``get_possible_movies``). The imdb title
    that is contributed most often is returned (frequency analysis).

    Args:
        movie: Value to look up in ``dedis['movie']``.

    Returns:
        The imdb ``'movie'`` value that occurs most often among the
        candidates. Ties are resolved by the sort order of the counts (the
        last entry after an ascending sort).

    Raises:
        KeyError: If there is not a single candidate (``movie`` does not
            occur in dedis or no imdb row falls into any date window).
    """
    # All dedis entries recorded for the given movie value.
    all_views = dedis[dedis['movie'] == movie]

    # Gather the candidate imdb rows per view and concatenate once.
    # DataFrame.append was removed in pandas 2.0, and calling it in a loop
    # copied the accumulated frame on every iteration.
    per_view = [
        get_possible_movies(row['date'])
        for _, row in all_views.iterrows()
    ]
    if not per_view:
        raise KeyError('movie hash not found in dedis: {!r}'.format(movie))

    # Number of times each imdb title showed up as a candidate.
    occurrences = pandas.concat(per_view).groupby('movie').size()
    if occurrences.empty:
        raise KeyError('no imdb candidate for movie hash: {!r}'.format(movie))

    # Ascending sort, so the most frequent title sits at the end.
    return occurrences.sort_values().index[-1]
def map_movies(my_hash):
    """Resolve every dedis movie recorded for ``my_hash`` to an imdb title.

    Args:
        my_hash: Value to look up in ``dedis['email']``.

    Returns:
        List with one imdb title per dedis row of ``my_hash``, in row order.
        The list is also printed, followed by a reminder about undoing the
        preprocessing.
    """
    own_rows = dedis[dedis['email'] == my_hash]
    # One lookup per dedis row; repeated movie values are resolved again.
    watched = [
        find_imdb_movie(entry['movie'])
        for _, entry in own_rows.iterrows()
    ]
    print(watched)
    print("Do not forget to change stuff from the preprocessing and change /' to ' ")
    return watched
def main():
    """Find the dedis email value for a known imdb user and list their movies.

    Takes the imdb ratings of the hard-coded address, infers the matching
    dedis email value from them and prints the imdb titles mapped from that
    user's dedis entries.
    """
    own_ratings = imdb[imdb['email'] == 'dennis.gankin@epfl.ch']
    map_movies(get_mail_hash(own_ratings))


if __name__ == "__main__":
    main()
Event Timeline
Log In to Comment