sentfeat.py
No OneTemporary
Actions

Subscribers

None

File Metadata


	import numpy as np

	# Input text files handled by save_sentence_feature(); each one is passed to
	# sentence_features(), which reads it as one sentence per line.
	files = ["pre_test_data.txt", "pre_train_neg_full.txt", "pre_train_pos_full.txt"]

	def sentence_features(name):
		"""Build one summed word-embedding vector per line of a text file.

		Each line of ``name`` is treated as a sentence and split on whitespace.
		Every token found in ``vocab_cut.txt`` contributes the row of
		``embeddings.npy`` at the word's (first) line index in the vocabulary;
		tokens that are not in the vocabulary are ignored.

		Args:
			name: Path of the text file to read, one sentence per line.

		Returns:
			A ``(sentence_vec, name)`` tuple. ``sentence_vec`` is an array of
			shape ``(number_of_lines, word_vec.shape[1])`` whose i-th row is the
			sum of the embedding rows of the known words in line i (all zeros
			when the line contains no known word). ``name`` is returned
			unchanged.
		"""
		# Context managers make sure both files are closed again.
		with open(name) as sentence_file:
			sentences = sentence_file.read().splitlines()
		with open("vocab_cut.txt", "r") as vocab_file:
			vocab = vocab_file.read().splitlines()
		print(len(vocab))
		word_vec = np.load("embeddings.npy")
		print(word_vec.shape)

		# Map every word to the index of its FIRST occurrence (the value that
		# list.index would return), so each lookup is a constant-time dict access
		# instead of a scan over the whole vocabulary list.
		word_index = {}
		for index, word in enumerate(vocab):
			word_index.setdefault(word, index)

		total = len(sentences)
		dim = word_vec.shape[1]
		sentence_vec = np.zeros((total, dim))

		for i, sentence in enumerate(sentences):
			# Words are separated by whitespace.
			row = np.zeros(dim)
			for word in sentence.split():
				index = word_index.get(word)
				if index is not None:
					row = row + word_vec[index]
			sentence_vec[i] = row
			# Progress report every 1000 sentences.
			if i % 1000 == 0:
				print(str(i) + "/" + str(total))
		return sentence_vec, name



	def lookup_word_vektor(vocab,word_vec,word,temp):
		"""Add the embedding of ``word`` to the running sum ``temp``.

		Args:
			vocab: Sequence of vocabulary words; the position of a word is used
				as its row index into ``word_vec``.
			word_vec: Indexable collection of embedding vectors (``word_vec[i]``
				belongs to ``vocab[i]``).
			word: The word to look up.
			temp: The running sum so far. It is not modified in place.

		Returns:
			``temp + word_vec[index]`` when ``word`` is in ``vocab`` (using its
			first occurrence), otherwise ``temp`` itself, unchanged.
		"""
		# A single index() call replaces the former "in" test followed by
		# index(), which scanned the vocabulary twice for every known word.
		try:
			index = vocab.index(word)
		except ValueError:
			# Unknown words contribute nothing.
			return temp
		return temp + word_vec[index]



	def save_sentence_feature():
		"""Compute and save the sentence features of every file in ``files``.

		For each file name, the feature matrix returned by
		``sentence_features`` is written with ``np.save`` to the file name
		without its last four characters plus ``"_feature"`` (``np.save``
		appends the ``.npy`` extension).
		"""
		for filename in files:
			# sentence_features returns a (matrix, name) tuple. Only the matrix is
			# saved: handing the whole tuple to np.save makes NumPy try to build
			# one array out of a 2-D array and a string, which yields a pickled
			# object array on old NumPy versions and an error on recent ones.
			the_sentence_feature, _ = sentence_features(filename)
			np.save(filename[:-4] + "_feature", the_sentence_feature)

	# Script entry point: build and save the features only when this file is run
	# directly, not when it is imported.
	if __name__ == '__main__':
		save_sentence_feature()