Page MenuHomec4science

prepro.py
No OneTemporary

File Metadata

Created
Sun, Feb 23, 12:56

prepro.py

from string import digits
from string import punctuation
from string import ascii_letters
import re
def preprocessing(name):
    """Clean every line of the text file ``name`` and write the result.

    The input is read completely, each line is passed through the cleaning
    steps below (in this exact order), and the cleaned lines are written,
    one per line, to a new file whose name is ``'pre_' + name``.

    Args:
        name: Path of the input text file. The output path is built by
            plain string concatenation, so ``name`` is expected to be a
            bare file name in the current directory.
    """
    # Context managers guarantee both handles are closed even if one of the
    # cleaning steps raises part-way through the file.
    with open(name, "r") as source:
        lines = source.read().splitlines()

    total = len(lines)
    with open('pre_' + name, "w") as target:
        for i, item in enumerate(lines):
            # Progress report (plus a sample of the raw line) every 1000 lines.
            # NOTE(review): the sample print is assumed to belong to this
            # branch; the original indentation was not available.
            if i % 1000 == 0:
                print(i, "/", total)
                print(item)
            item = delete_user(item)
            item = delete_url(item)
            item = removePunctuationSpaces(item)
            item = removeMultipleLetters(item)  # more than three?
            item = removeDigits(item)
            item = delete_points(item)
            item = delete_weird_char(item)
            item = separateHashtags(item)
            item = hahaHandler(item)
            target.write(item + '\n')
def delete_user(string):
    """Return ``string`` with every ``<user>`` placeholder removed."""
    placeholder = '<user>'
    return string.replace(placeholder, '')
def delete_url(string):
    """Return ``string`` with every ``<url>`` placeholder removed."""
    placeholder = '<url>'
    return string.replace(placeholder, '')
def delete_points(string):
    """Remove dots from ``string`` without gluing neighbouring words together.

    A dot directly between two ASCII letters becomes a single space
    ("hello.world" -> "hello world"); every other dot is deleted. Runs of
    spaces left behind are then collapsed to one space.

    Args:
        string: The text to clean.

    Returns:
        The text with no ``.`` characters and no repeated spaces.
    """
    # Lookarounds keep both letters intact and unconsumed, so chained dots
    # such as "a.b.c" are all handled ("a b c").
    string = re.sub(r'(?<=[a-zA-Z])\.(?=[a-zA-Z])', ' ', string)
    string = string.replace('.', '')
    string = re.sub(r' +', ' ', string)
    return string
#string = re.sub(r' \.+ ', ' ', string )
#change points
def delete_weird_char(string):
    """Drop isolated single characters that are not letters or ``( ) ? !``.

    A character is removed only when it stands alone between two spaces;
    the same character attached to a word is left untouched. A space is
    itself outside the kept set, so runs of three or more spaces are
    shortened as well.

    Args:
        string: The text to clean.

    Returns:
        The text with space-delimited stray characters removed.
    """
    # The trailing space is a lookahead (not consumed) so it can serve as the
    # leading space of the next match: "a , ; b" -> "a b" rather than "a ; b".
    string = re.sub(r' [^a-zA-Z()?!](?= )', '', string)
    return string
# " , ; : ' + * / \ & % $ § ]
# Remove every digit in a string
def removeDigits(string):
    """Return ``string`` with each of the characters ``0``-``9`` removed."""
    kept = [char for char in string if char not in digits]
    return ''.join(kept)
# Matches a single space that sits directly between two punctuation
# characters (as defined by string.punctuation). Lookarounds leave the
# punctuation itself unconsumed so neighbouring gaps can all match.
_PUNCTUATION_GAP = re.compile(
    '(?<=[{0}]) (?=[{0}])'.format(re.escape(punctuation))
)


def removePunctuationSpaces(string):
    """Remove the spaces between punctuation signs (example: "^ . ^" -> "^.^").

    Only a single space with a punctuation character immediately on both
    sides is removed; spaces next to letters, digits or other spaces stay.
    Runs of the same sign are fully joined ("! ! !" -> "!!!").

    Args:
        string: The text to clean.

    Returns:
        The text with punctuation-to-punctuation gaps closed.
    """
    return _PUNCTUATION_GAP.sub('', string)
# Removes multiple occurrences of the same letter (example: "yeeeeaahhhh" -> "yeah")
def removeMultipleLetters(string):
    """Collapse each run of one repeated ASCII letter to a single letter.

    The comparison is case-sensitive and only letters are affected, so
    "yeeeeaahhhh" becomes "yeah" while repeated digits or symbols stay.
    """
    repeated_letter = re.compile(r'([a-zA-Z])\1+')
    return repeated_letter.sub(lambda run: run.group(1), string)
#for i in ascii_letters:
# while(string.count(i+i)>0):
# string = string.replace(i+i,i)
#return string
def separateHashtags(string):
    """Insert a space before every ``#`` that directly follows a non-space.

    This splits hashtags glued to the preceding text or to each other
    ("love#life" -> "love #life", "#a#b" -> "#a #b"). A ``#`` at the start
    of the string or already preceded by a space is left as is.

    Args:
        string: The text to process.

    Returns:
        The text with each hashtag separated from what precedes it.
    """
    # Lookbehind instead of matching the preceding character, so that
    # character is kept rather than overwritten by the inserted space.
    string = re.sub(r'(?<=[^ ])#', ' #', string)
    return string
# Detects the different versions of laughs ("hahaha", "jajaja", etc.) and maps them to the same word (or several occurrences of the same word)
def hahaHandler(string):
    """Normalise laughs by chaining the three laugh helpers.

    Order of application: hahaParseLeft, then hahaShorten, then
    hahaParseRight.
    """
    left_split = hahaParseLeft(string)
    # Shortening in between handles an odd number of "ha"/"ah".
    shortened = hahaShorten(left_split)
    return hahaParseRight(shortened)
# Transforms laughs ("haha") into same single version (example: "hahahaha" -> "haha")
def hahaShorten(string):
    """Replace each laugh run with a single "haha".

    A run is one or more consecutive "ahah"/"haha"/"jaja"/"ajaj" groups
    followed by any number of trailing "a" or "h" characters.
    """
    laugh_run = "(ahah|haha|jaja|ajaj)+(a|h)*"
    return re.sub(laugh_run, "haha", string)
# Transforms a strong laugh ("hahahahahaha") into several small laughs by putting a space on the left of each occurrence of "haha"
# (example: "hahahahahaha" -> " haha haha haha", "hahahaha" -> " haha haha", "hahahahaha" -> " haha hahaha")
def hahaParseLeft(string):
    """Rewrite each "haha"/"ahah"/"jaja"/"ajaj" as " haha" (leading space)."""
    laugh_unit = "haha|ahah|jaja|ajaj"
    return re.sub(laugh_unit, " haha", string)
# Transforms a strong laugh ("hahahahahaha") into several small laughs by putting a space on the right of each occurrence of "haha"
# (example: "hahahahahaha" -> "haha haha haha ", "hahahaha" -> "haha haha ", "hahahahaha" -> "haha haha ha")
def hahaParseRight(string):
    """Rewrite each "haha"/"ahah"/"jaja"/"ajaj" as "haha " (trailing space)."""
    laugh_unit = "haha|ahah|jaja|ajaj"
    return re.sub(laugh_unit, "haha ", string)
#def ahCounter(string):
# sentence = string.split()
# aa = [x for x in sentence if (x.count("a")+x.count("h")) > 4]
# if(len(aa)>0):
# print(aa)

Event Timeline