Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F102722925
prepro.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Feb 23, 12:56
Size
3 KB
Mime Type
text/x-python
Expires
Tue, Feb 25, 12:56 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
24410315
Attached To
rTZUCT ML_Project1
prepro.py
View Options
from
string
import
digits
from
string
import
punctuation
from
string
import
ascii_letters
import
re
def preprocessing(name):
    """Clean every line of the text file ``name`` and write it to ``'pre_' + name``.

    Each input line goes through the cleaning steps in this fixed order:
    delete_user, delete_url, removePunctuationSpaces, removeMultipleLetters,
    removeDigits, delete_points, delete_weird_char, separateHashtags and
    hahaHandler. The cleaned line is then written followed by a newline.

    Progress is printed to stdout every 1000 lines.

    Args:
        name: Path of the input file. The output path is built by plain
            string concatenation ``'pre_' + name``.
    """
    # Context managers close both handles even when a cleaning step raises;
    # the input is still opened first, so a missing input never creates an
    # empty output file.
    with open(name, "r") as source, open('pre_' + name, "w") as target:
        file_array = source.read().splitlines()
        total = len(file_array)

        for i, item in enumerate(file_array):
            if i % 1000 == 0:
                print(i, "/", total)
                # NOTE(review): the extracted source lost its indentation, so
                # it is ambiguous whether this sample line was printed on
                # every iteration or only with the progress counter; it is
                # kept with the progress report here - confirm.
                print(item)

            item = delete_user(item)
            item = delete_url(item)
            item = removePunctuationSpaces(item)
            item = removeMultipleLetters(item)  # more than three?
            item = removeDigits(item)
            item = delete_points(item)
            item = delete_weird_char(item)
            item = separateHashtags(item)
            item = hahaHandler(item)

            target.write(item + '\n')
def delete_user(string):
    """Return ``string`` with every literal ``<user>`` placeholder removed."""
    # Splitting on the placeholder and gluing the pieces back together drops
    # each occurrence without touching the surrounding text.
    pieces = string.split('<user>')
    return ''.join(pieces)
def delete_url(string):
    """Return ``string`` with every literal ``<url>`` placeholder removed."""
    # Splitting on the placeholder and gluing the pieces back together drops
    # each occurrence without touching the surrounding text.
    pieces = string.split('<url>')
    return ''.join(pieces)
def delete_points(string):
    """Remove dots from ``string`` while keeping dot-joined words apart.

    A dot sitting directly between two ASCII letters ("hello.world") is
    turned into a space so the two words stay separate. Every remaining dot
    is deleted, and finally each run of spaces is collapsed to a single one.

    Args:
        string: The text to clean.

    Returns:
        The text without any dot characters.
    """
    # Lookarounds leave both neighbouring letters in place. The previous
    # replacement r'\1 \1' repeated the left letter and lost the right one
    # ("a.b" -> "a a"), and because it consumed the letters it also skipped
    # every second dot in chains such as "u.s.a".
    string = re.sub(r'(?<=[a-zA-Z])\.(?=[a-zA-Z])', ' ', string)
    string = string.replace('.', '')
    return re.sub(r' +', ' ', string)
#string = re.sub(r' \.+ ', ' ', string )
#change points
def delete_weird_char(string):
    """Drop isolated single characters that are not letters or ``( ) ? !``.

    A character counts as isolated when it has a space directly on both
    sides. Such a character is removed together with one of its surrounding
    spaces, so "a , b" becomes "a b". ASCII letters and the characters
    ``(``, ``)``, ``?`` and ``!`` are kept.

    Args:
        string: The text to clean.

    Returns:
        The text with the isolated characters removed.
    """
    # The trailing space is only looked at, not consumed, so it can also act
    # as the leading space of the next isolated character. Consuming it (as
    # the pattern ' [^a-zA-Z()?!] ' did) left every second symbol of a run
    # such as "a , ; b" in place.
    return re.sub(r' [^a-zA-Z()?!](?= )', '', string)
# " , ; : ' + * / \ & % $ § ]
# Remove every digit in a string
def removeDigits(string):
    """Return ``string`` without any of the ASCII digits ``0``-``9``."""
    # Keep every character that is not one of string.digits.
    kept = [char for char in string if char not in digits]
    return ''.join(kept)
# Removes the spaces between punctuation signs (example: "^ . ^" -> "^.^")
# Matches a single space that has a punctuation character (string.punctuation)
# directly on both sides. Built once at import time.
_PUNCTUATION_GAP = re.compile(
    '(?<=[{0}]) (?=[{0}])'.format(re.escape(punctuation))
)


def removePunctuationSpaces(string):
    """Remove the single space separating two punctuation characters.

    Example: "^ . ^" -> "^.^". Spaces next to letters, digits or other
    spaces are left alone.

    Args:
        string: The text to clean.

    Returns:
        The text with punctuation marks joined together.
    """
    # One regex pass replaces the former loop over every ordered pair of
    # punctuation characters (roughly a thousand str.replace scans per line).
    # Because the neighbours are only looked at and never consumed, repeated
    # marks are handled too: "! ! !" now becomes "!!!" instead of "!! !".
    return _PUNCTUATION_GAP.sub('', string)
# Removes multiple occurrences of the same letter (example: "yeeeeaahhhh" -> "yeah")
def removeMultipleLetters(string):
    """Collapse every run of one repeated ASCII letter to a single letter.

    The comparison is case-sensitive and only letters are affected, so
    "yeeeeaahhhh" becomes "yeah" while "aA" and "!!!" are left unchanged.
    Note that legitimate double letters are collapsed as well.
    """
    repeated_letter = re.compile(r'([a-zA-Z])\1+')
    # Keep just the first letter of each matched run.
    return repeated_letter.sub(lambda run: run.group(1), string)
#for i in ascii_letters:
# while(string.count(i+i)>0):
# string = string.replace(i+i,i)
#return string
def separateHashtags(string):
    """Insert a space before every ``#`` that directly follows a non-space.

    Example: "love#happy" -> "love #happy". A hash at the very start of the
    text or one already preceded by a space is left untouched.

    Args:
        string: The text to process.

    Returns:
        The text with glued hashtags split off from the preceding character.
    """
    # The character in front of the hash is captured and written back. The
    # previous pattern r'[^ ]#' replaced it together with the hash, silently
    # deleting it ("hello#tag" -> "hell #tag").
    return re.sub(r'([^ ])#', r'\1 #', string)
# Detects the different versions of laughs ("hahaha", "jajaja", etc.) and maps them to the same word (or several occurrences of the same word)
def hahaHandler(string):
    """Normalise laughs in ``string`` by chaining the three laugh helpers.

    The text is passed through hahaParseLeft, then hahaShorten, then
    hahaParseRight, in that order, and the final result is returned.
    """
    left_split = hahaParseLeft(string)
    shortened = hahaShorten(left_split)
    # The right-hand pass is done in case there is an odd number of "ha"/"ah".
    return hahaParseRight(shortened)
# Transforms laughs ("haha") into same single version (example: "hahahaha" -> "haha")
def hahaShorten(string):
    """Replace each laugh run in ``string`` with a single "haha".

    A laugh run is one or more repetitions of "ahah", "haha", "jaja" or
    "ajaj", followed by any trailing "a"/"h" characters
    (example: "hahahaha" -> "haha").
    """
    laugh_run = re.compile("(ahah|haha|jaja|ajaj)+(a|h)*")
    return laugh_run.sub("haha", string)
# Transforms a strong laugh ("hahahahahaha") into several small laughs by putting a space on the left of each occurrence of "haha"
# (example: "hahahahahaha" -> " haha haha haha", "hahahaha" -> " haha haha", "hahahahaha" -> " haha hahaha")
def hahaParseLeft(string):
    """Rewrite every laugh unit as " haha" (space on the left).

    Each non-overlapping "haha", "ahah", "jaja" or "ajaj" is replaced, so
    "hahahaha" becomes " haha haha" and a leftover half such as the final
    "ha" of "hahahahaha" stays glued to the last unit (" haha hahaha").
    """
    laugh_unit = re.compile("haha|ahah|jaja|ajaj")
    return laugh_unit.sub(" haha", string)
# Transforms a strong laugh ("hahahahahaha") into several small laughs by putting a space on the right of each occurrence of "haha"
# (example: "hahahahahaha" -> "haha haha haha ", "hahahaha" -> "haha haha ", "hahahahaha" -> "haha haha ha")
def hahaParseRight(string):
    """Rewrite every laugh unit as "haha " (space on the right).

    Each non-overlapping "haha", "ahah", "jaja" or "ajaj" is replaced, so
    "hahahaha" becomes "haha haha " and a leftover half such as the final
    "ha" of "hahahahaha" is split off ("haha haha ha").
    """
    laugh_unit = re.compile("haha|ahah|jaja|ajaj")
    return laugh_unit.sub("haha ", string)
#def ahCounter(string):
# sentence = string.split()
# aa = [x for x in sentence if (x.count("a")+x.count("h")) > 4]
# if(len(aa)>0):
# print(aa)
Event Timeline
Log In to Comment