File Metadata

Created: Sun, Feb 23, 12:56

prepro.py
View Options

	from string import digits
	from string import punctuation
	from string import ascii_letters
	import re

	def preprocessing(name):
	    """Clean the text file *name* line by line and write the result to ``'pre_' + name``.

	    Every input line is passed through the cleaning helpers in a fixed
	    order and written to the output file followed by a newline. A progress
	    counter and the current raw line are printed every 1000 lines.

	    The output path is built by plain string concatenation, so a *name*
	    that contains a directory part yields ``'pre_<dir>/<file>'``.

	    Args:
	        name: Path of the input text file.
	    """
	    # Context managers guarantee both handles are closed even when one of
	    # the cleaning steps raises part-way through the file.
	    with open(name, "r") as file:
	        file_array = file.read().splitlines()
	    total = len(file_array)

	    with open('pre_' + name, "w") as new_file:
	        for i, item in enumerate(file_array):
	            if i % 1000 == 0:
	                print(i, "/", total)
	                # NOTE(review): the source lost its indentation; printing the
	                # sample line only on progress ticks is the assumed intent.
	                print(item)
	            # The order matters: each step works on the previous step's output.
	            item = delete_user(item)
	            item = delete_url(item)
	            item = removePunctuationSpaces(item)
	            item = removeMultipleLetters(item)  # more than three?
	            item = removeDigits(item)
	            item = delete_points(item)
	            item = delete_weird_char(item)
	            item = separateHashtags(item)
	            item = hahaHandler(item)
	            new_file.write(item + '\n')


	def delete_user(string):
	    """Return *string* with every literal ``<user>`` placeholder removed."""
	    # Splitting on the placeholder and gluing the pieces back together
	    # drops every occurrence of it.
	    pieces = string.split('<user>')
	    return ''.join(pieces)

	def delete_url(string):
	    """Return *string* with every literal ``<url>`` placeholder removed."""
	    # Splitting on the placeholder and gluing the pieces back together
	    # drops every occurrence of it.
	    pieces = string.split('<url>')
	    return ''.join(pieces)

	def delete_points(string):
	    """Remove dots from *string* and collapse runs of spaces.

	    A dot sitting directly between two ASCII letters is turned into a
	    space so the two words stay separated ("hello.world" -> "hello world").
	    Every other dot is deleted, and runs of spaces are then squeezed to a
	    single space.

	    Args:
	        string: The text to clean.

	    Returns:
	        The text without any '.' characters.
	    """
	    # Lookarounds keep both neighbouring letters intact (the replacement
	    # used to duplicate the first letter and drop the second) and also
	    # handle chains such as "a.b.c", where matches would otherwise overlap.
	    string = re.sub(r'(?<=[a-zA-Z])\.(?=[a-zA-Z])', ' ', string)
	    string = string.replace('.', '')
	    return re.sub(r' +', ' ', string)
	#string = re.sub(r' \.+ ', ' ', string )
	#change points


	def delete_weird_char(string):
	    """Drop stand-alone single characters that are not letters or ``( ) ? !``.

	    A character counts as stand-alone when it has a space on both sides,
	    e.g. the comma in "a , b". Symbols attached to a word and tokens at
	    the very start or end of the string are left untouched.

	    Args:
	        string: The text to clean.

	    Returns:
	        The text with the stand-alone symbols (and their leading space)
	        removed.
	    """
	    # The trailing space is only looked at, not consumed, so it can serve
	    # as the leading space of the next token. Consuming it left the second
	    # of two adjacent symbols in place ("a , ; b" -> "a ; b").
	    return re.sub(r' [^a-zA-Z()?!](?= )', '', string)
	# " , ; : ' + * / \ & % $ § ]



	def removeDigits(string):
	    """Return *string* with every ASCII digit (0-9) removed."""
	    # A translation table that maps a code point to None deletes it.
	    drop_table = dict.fromkeys(map(ord, digits))
	    return string.translate(drop_table)

	# A single space that has an ASCII punctuation character on both sides.
	_PUNCTUATION_GAP = re.compile(
	    '(?<=[' + re.escape(punctuation) + ']) (?=[' + re.escape(punctuation) + '])'
	)

	def removePunctuationSpaces(string):
	    """Remove the single spaces between punctuation signs ("^ . ^" -> "^.^").

	    Only a lone space with a punctuation character directly on each side
	    is removed; spaces next to letters, digits or other spaces stay.

	    Args:
	        string: The text to clean.

	    Returns:
	        The text with spaced-out punctuation glued together.
	    """
	    # One regex pass replaces the 32 x 32 chained str.replace calls. The
	    # lookarounds do not consume the punctuation, so runs such as ". . ."
	    # collapse fully to "..." instead of stopping at ".. .".
	    return _PUNCTUATION_GAP.sub('', string)

	def removeMultipleLetters(string):
	    """Collapse every run of one repeated ASCII letter to a single letter.

	    Example: "yeeeeaahhhh" -> "yeah". Matching is case-sensitive, and
	    ordinary double letters are collapsed as well ("hello" -> "helo").
	    """
	    repeated_letter = re.compile(r'([a-zA-Z])\1+')
	    # Keep just the first letter of each run.
	    return repeated_letter.sub(lambda run: run.group(1), string)
	#for i in ascii_letters:
	# while(string.count(i+i)>0):
	# string = string.replace(i+i,i)
	#return string

	def separateHashtags(string):
	    """Insert a space before every '#' that is glued to the preceding text.

	    Example: "hello#tag" -> "hello #tag". A '#' at the start of the
	    string or already preceded by a space is left as is.

	    Args:
	        string: The text to clean.

	    Returns:
	        The text with each hashtag separated from the text before it.
	    """
	    # The lookbehind only checks the preceding character; matching it as
	    # part of the pattern replaced it too ("hello#tag" -> "hell #tag").
	    return re.sub(r'(?<=[^ ])#', ' #', string)


	def hahaHandler(string):
	    """Normalize the laughs in *string* ("hahaha", "jajaja", ...) to one common word.

	    Applies hahaParseLeft, hahaShorten and hahaParseRight, in that order,
	    and returns the result.
	    """
	    # hahaShorten runs between the two parse passes in case there is an
	    # odd number of "ha"/"ah".
	    for laugh_pass in (hahaParseLeft, hahaShorten, hahaParseRight):
	        string = laugh_pass(string)
	    return string

	def hahaShorten(string):
	    """Replace each unbroken laugh with the single word "haha".

	    A laugh is one or more repetitions of "ahah", "haha", "jaja" or
	    "ajaj", followed by any leftover 'a'/'h' characters
	    (example: "hahahahaha" -> "haha").

	    Args:
	        string: The text to clean.

	    Returns:
	        The text with every such laugh shortened to "haha".
	    """
	    # Python's re uses a bare '|' for alternation; the grep-style '\|'
	    # matches a literal pipe, so the pattern never matched a laugh.
	    return re.sub(r'(ahah|haha|jaja|ajaj)+(a|h)*', 'haha', string)

	def hahaParseLeft(string):
	    """Replace each "haha"/"ahah"/"jaja"/"ajaj" with " haha" (space on the left).

	    A long laugh is thereby split into several short ones
	    (examples: "hahahaha" -> " haha haha", "hahahahaha" -> " haha hahaha").

	    Args:
	        string: The text to clean.

	    Returns:
	        The text with every four-letter laugh chunk rewritten as " haha".
	    """
	    # Python's re uses a bare '|' for alternation; the grep-style '\|'
	    # matches a literal pipe, so the pattern never matched a laugh.
	    return re.sub(r'haha|ahah|jaja|ajaj', ' haha', string)

	def hahaParseRight(string):
	    """Replace each "haha"/"ahah"/"jaja"/"ajaj" with "haha " (space on the right).

	    A long laugh is thereby split into several short ones
	    (examples: "hahahaha" -> "haha haha ", "hahahahaha" -> "haha haha ha").

	    Args:
	        string: The text to clean.

	    Returns:
	        The text with every four-letter laugh chunk rewritten as "haha ".
	    """
	    # Python's re uses a bare '|' for alternation; the grep-style '\|'
	    # matches a literal pipe, so the pattern never matched a laugh.
	    return re.sub(r'haha|ahah|jaja|ajaj', 'haha ', string)

	#def ahCounter(string):
	# sentence = string.split()
	# aa = [x for x in sentence if (x.count("a")+x.count("h")) > 4]
	# if(len(aa)>0):
	# print(aa)

prepro.py
No OneTemporary
Actions

File Metadata

prepro.py
View Options

Event Timeline

prepro.pyNo OneTemporaryActions

File Metadata

prepro.pyView Options

Event Timeline

prepro.py
No OneTemporary
Actions

prepro.py
View Options