Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F102782754
run_anonymization.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, Feb 24, 03:36
Size
4 KB
Mime Type
text/x-python
Expires
Wed, Feb 26, 03:36 (2 d)
Engine
blob
Format
Raw Data
Handle
24423072
Attached To
R6092 InfoSec
run_anonymization.py
View Options
#!/usr/bin/env python3
import
random
,
datetime
,
sys
,
csv
import
string
from
random
import
randrange
,
randint
import
os
import
pickle
import
hashlib
# First day of the rating window; create_db adds a random offset of
# 1..date_period days to it to obtain each rating date.
date_start = datetime.date(year=2000, month=1, day=1)
# Length of the rating window in days: 17 years of 365 days each.
date_period = 17 * 365
def _read_nonempty_lines(path):
    """Return the lines of the text file at *path*, without empty lines."""
    with open(path) as handle:
        # Split on "\n" only, so no other character is treated as a line
        # break; filter in one pass instead of repeated list.remove("").
        return [line for line in handle.read().split("\n") if line]


def create_db(nbr_movies):
    """Build a random rating database from emails.txt and movies.txt.

    Both files are read from the current working directory, one entry per
    line; empty lines are ignored. For every email, up to ``nbr_movies``
    distinct movies are picked at random and each gets a random date and a
    random grade.

    Args:
        nbr_movies: number of movies to rate per email. If it exceeds the
            number of available movies, every movie is rated once.

    Returns:
        A tuple ``(db, emails, movies)`` where ``db`` is a list of rows
        ``[user, movie, date, grade]``: ``date`` is a "YYYY/MM/DD" string
        between 1 and ``date_period`` days after ``date_start`` and
        ``grade`` is an int from 1 to 5.

    Raises:
        OSError: if emails.txt or movies.txt cannot be opened.
    """
    emails = _read_nonempty_lines("emails.txt")
    movies = _read_nonempty_lines("movies.txt")

    db = []
    for email in emails:
        # Shuffle all indexes and keep a prefix: this yields distinct movies
        # and silently caps the count at len(movies).
        movie_indexes = list(range(len(movies)))
        random.shuffle(movie_indexes)
        for movie_index in movie_indexes[:nbr_movies]:
            day = date_start + datetime.timedelta(randint(1, date_period))
            db.append([
                email,
                movies[movie_index],
                day.strftime("%Y/%m/%d"),
                randint(1, 5),
            ])
    return db, emails, movies
def anonymize_1(db):
    """Fully anonymize the database by blanking users and dates.

    Only the movie (column 1) and the grade (column 3) are kept, which is
    all that get_movies_with_rating reads. The rows are modified in place.

    Args:
        db: list of rows ``[user, movie, date, grade]``.

    Returns:
        The same ``db`` list, with column 0 and column 2 of every row
        replaced by ``'*'``.
    """
    masked = '*'
    for row in db:
        row[0] = masked  # user
        row[2] = masked  # date
    return db
def get_movies_with_rating(anon, rating):
    """Return the movies that received the given rating at least once.

    Args:
        anon: anonymized database, a list of rows whose column 1 is the
            movie and column 3 is the grade.
        rating: the grade to look for; compared with ``==`` against
            column 3 of each row.

    Returns:
        A list of movies without duplicates, in order of first appearance
        in ``anon``.
    """
    # A set makes the duplicate check O(1) per row; the list keeps the
    # order of first appearance.
    seen = set()
    titles = []
    for row in anon:
        if rating == row[3] and row[1] not in seen:
            seen.add(row[1])
            titles.append(row[1])
    return titles
def anonymize_2(db):
    """Pseudonymize users and blank dates, keeping rows linkable per user.

    Weaker anonymization than anonymize_1: every distinct user value is
    replaced by one random pseudonym, used consistently across all of that
    user's rows, so get_top_rated can still group ratings by user. The
    pseudonym is freshly drawn random data rather than a hash of the
    email, so it cannot be recomputed from a candidate email address. The
    email-to-pseudonym mapping is dropped when the function returns.

    The rows are modified in place.

    Args:
        db: list of rows ``[user, movie, date, grade]``.

    Returns:
        The same ``db`` list, with column 0 replaced by the user's
        pseudonym (32 hex characters) and column 2 replaced by ``'*'``.
    """
    pseudonyms = {}
    for row in db:
        email = row[0]
        if email not in pseudonyms:
            # 128 random bits per user: collisions are not a practical
            # concern.
            pseudonyms[email] = os.urandom(16).hex()
        row[0] = pseudonyms[email]
        row[2] = '*'
    return db
def get_top_rated(anon, movie):
    """Return the top-rated movies of every user who rated ``movie``.

    For each row of ``anon`` whose movie equals ``movie``, the user of that
    row contributes all movies they gave their own highest grade to. The
    results of all such users are concatenated, so a movie can appear more
    than once.

    Args:
        anon: anonymized database, a list of rows with the user in
            column 0, the movie in column 1 and the grade in column 3.
        movie: the movie whose raters are looked up.

    Returns:
        A list of movies, duplicates included.
    """
    ratings_by_user = {}  # user -> {grade: [movies with that grade]}
    raters = []           # users of the rows matching `movie`, in row order

    for row in anon:
        user, title, grade = row[0], row[1], row[3]
        if user not in ratings_by_user:
            # Pre-create every grade bucket so later lookups never miss.
            buckets = {level: [] for level in range(1, 6)}
            buckets[grade] = [title]
            ratings_by_user[user] = buckets
        else:
            ratings_by_user[user][grade].append(title)
        if title == movie:
            raters.append(user)

    top_rated = []
    for user in raters:
        buckets = ratings_by_user[user]
        # Highest non-empty grade from 5 down to 2; fall back to grade 1.
        best_grade = next((g for g in (5, 4, 3, 2) if buckets[g]), 1)
        top_rated.extend(buckets[best_grade])
    return top_rated
# This is called when you start the script on localhost, and when the
# checker wants to run your functions.
#
# Command line read by the code below:
#   argv[1]  path of the CSV database to load
#   argv[2]  exercise id: "ex1aa", "ex1ag", "ex1ba" or "ex1bg"
#   argv[3]  rating (for "ex1ag") or movie (for "ex1bg")
# The result is written as CSV to /tmp/student.csv.
if __name__ == "__main__":
    # This part can be modified at your convenience.
    # NOTE: the bare string literal below is disabled local test code; it is
    # evaluated and discarded, nothing in it runs.
    """if len(sys.argv) == 1:
print("Testing mode")
#db, emails, movies = create_db(10000000)
anon_db1 = anonymize_1(db)
anon_db2 = anonymize_2(db)"""

    # If you modify this part, don't complain if it doesn't work anymore!
    # This part is used to communicate with the verification-script. So you
    # should not touch it (unless you're looking for a bug to exploit the
    # verification script - but we didn't plan to put one in there).
    if len(sys.argv) >= 3:
        db_file, ex = sys.argv[1:3]
        with open(db_file) as f:
            db = list(csv.reader(f, skipinitialspace=True))
        # Get nice ints for comparisons
        # (csv.reader yields strings; column 3 is the grade.)
        for i, line in enumerate(db):
            db[i][3] = int(line[3])

        # Stays empty when `ex` matches none of the ids below, in which case
        # an empty file is written.
        result = []
        if ex == "ex1aa":
            result = anonymize_1(db)
        elif ex == "ex1ag":
            rating = int(sys.argv[3])
            # Wrapped in a list so the movies are written as one CSV row.
            result = [get_movies_with_rating(db, rating)]
        elif ex == "ex1ba":
            result = anonymize_2(db)
        elif ex == "ex1bg":
            movie = sys.argv[3]
            # Wrapped in a list so the movies are written as one CSV row.
            result = [get_top_rated(db, movie)]

        with open("/tmp/student.csv", "w") as f:
            writer = csv.writer(f)
            writer.writerows(iter(result))
Event Timeline
Log In to Comment