Page MenuHomec4science

load_files.py
No OneTemporary

File Metadata

Created
Sat, May 4, 21:43

load_files.py

import os
import json
from abc import abstractmethod
class Dataset:
    """Base class managing train/eval/test splits for fine-tuning datasets.

    Handles JSON/TXT persistence under ``data_path`` and the train/eval
    split. Subclasses must implement :meth:`_format_data`,
    :meth:`_save_data` and :meth:`print_example`.

    Note: methods navigate directories with ``os.chdir`` (process-wide
    side effect), mirroring the original design.
    """

    def __init__(self, home_path, finetune_path, data_path="./datasets/", split=0.9):
        # Splits are populated lazily by load_data().
        self.xtrain = None
        self.ytrain = None
        self.xeval = None
        self.yeval = None
        self.xtest = None
        self.ytest = None
        self.data_path = data_path
        # Directory the data was actually loaded from; set by load_data().
        self.loc_data_path = None
        self.finetune_path = finetune_path
        self.home_path = home_path
        # Fraction of the train file kept for training; the rest becomes eval.
        self.split = split
        # Special tokens: input/answer/find markers and the end-of-text token.
        self.keywords = ["<input>", "<answer>", "<find>", "<|endoftext|>"]

    # protected functions
    def _openJSON(self, dataset, end="", ext=".json"):
        """Load and return the JSON content of '<dataset><end><ext>'."""
        with open(dataset + end + ext, 'r') as fp:
            tmp = json.load(fp)
        return tmp

    def _writeTXT(self, data, name):
        """Write string ``data`` to '<name>.txt' in the current directory."""
        with open(name + ".txt", 'w') as f:
            f.write(data)

    def _writeJSON(self, data, name):
        """Serialize ``data`` as JSON to '<name>.json' in the current directory."""
        with open(name + ".json", "w") as f:
            json.dump(data, f)

    @abstractmethod
    def _format_data(self, end):
        """Transform loaded splits into the model-specific on-disk format."""
        # Fixed message: previously copy-pasted as "format_data" everywhere.
        raise NotImplementedError("_format_data has to be defined in each child class")

    @abstractmethod
    def _save_data(self, x, y):
        """Apply model-specific pre-processing to (x, y) before saving."""
        raise NotImplementedError("_save_data has to be defined in each child class")

    # Public methods
    def save_data(self, x, y, dir, end="", x_test=None, y_test=None, console=None):
        """Preprocess and persist train/test splits as JSON under data_path/dir.

        ``console`` (optional string) is additionally written to 'console<end>.txt'.
        Note: ``dir`` shadows the builtin but is kept for caller compatibility.
        """
        # None-sentinel instead of mutable [] defaults (shared-state pitfall).
        if x_test is None:
            x_test = []
        if y_test is None:
            y_test = []
        os.chdir(self.home_path)
        os.chdir(self.data_path)
        # makedirs(exist_ok=True) avoids the exists()/mkdir() race.
        os.makedirs(dir, exist_ok=True)
        os.chdir(dir)
        x, y = self._save_data(x, y)
        x_test, y_test = self._save_data(x_test, y_test)
        print("The train set has {} entries".format(len(x)))
        self._writeJSON(x, "x_train" + end)
        self._writeJSON(y, "y_train" + end)
        print("The test set has {} entries".format(len(x_test)))
        self._writeJSON(x_test, "x_test" + end)
        self._writeJSON(y_test, "y_test" + end)
        if console is not None:
            self._writeTXT(console, "console" + end)

    def load_data(self, end="", dir=None):
        """Load splits from data_path[/dir], carve out eval, then format them.

        The last (1 - split) fraction of the train file becomes the eval set.
        """
        os.chdir(self.home_path)
        os.chdir(self.data_path)
        if dir is not None:
            os.chdir("./" + dir + "/")
        self.loc_data_path = os.getcwd()
        self.xtest = self._openJSON('x_test', end=end)
        self.ytest = self._openJSON('y_test', end=end)
        self.xtrain = self._openJSON('x_train', end=end)
        self.ytrain = self._openJSON('y_train', end=end)
        # split train in evaluation and train
        idx_split = int(self.split * len(self.xtrain))
        self.xeval = self.xtrain[idx_split:]
        self.yeval = self.ytrain[idx_split:]
        self.xtrain = self.xtrain[:idx_split]
        self.ytrain = self.ytrain[:idx_split]
        os.chdir(self.finetune_path)
        self._format_data(end)

    def get_train(self):
        """Return [xtrain, ytrain]."""
        return [self.xtrain, self.ytrain]

    def get_eval(self):
        """Return [xeval, yeval]."""
        return [self.xeval, self.yeval]

    def get_test(self):
        """Return [xtest, ytest]."""
        return [self.xtest, self.ytest]

    @abstractmethod
    def print_example(self, set):
        """Print a small sample of ``set`` for inspection."""
        raise NotImplementedError("print_example has to be defined in each child class")
class DatasetBert(Dataset):
    """Dataset specialization for extractive QA fine-tuning (BERT-style).

    Wraps 'question---context' pairs with the special tokens on save, and
    unwraps them into SQuAD-like {question, context, answers} records on
    format.
    """

    def __init__(self, home_path, data_path="./datasets/", split=0.9):
        # Resolve the question-answering example directory as finetune_path.
        os.chdir(home_path)
        os.chdir("./transformers/examples/")
        os.chdir("./question-answering")
        super().__init__(home_path, os.getcwd(), data_path, split)

    # protected functions
    def _save_data(self, x, y):
        """Tokenize pairs in place: '<input>ctx<find>question<answer>' / 'ans<eot>\\n'."""
        tag_in, tag_ans, tag_find, tag_end = self.keywords
        for idx in range(len(x)):
            pieces = x[idx].split("---")
            # pieces[0] is the question, pieces[1] the context.
            x[idx] = tag_in + pieces[1] + tag_find + pieces[0] + tag_ans
        for idx in range(len(y)):
            y[idx] = y[idx] + tag_end + "\n"
        return x, y

    def _format_data(self, end):
        """Emit SQuAD-like train/eval JSON and strip tokens from the splits."""
        kw_in, kw_ans, kw_find, kw_end = self.keywords

        # Invert the wrapping done in _save_data for entry i_qca.
        def get_qca(xdata, ydata, i_qca):
            cut = xdata[i_qca].find(kw_find)
            context = xdata[i_qca][len(kw_in):cut]
            question = xdata[i_qca][cut + len(kw_find):-len(kw_ans)]
            answer = ydata[i_qca][:-len(kw_end) - 1]
            return context, question, answer

        # Build the {question, context, answers} columnar dictionary.
        def gen_qa_set(xdata, ydata):
            qa_set = {"data": {"question": [], "context": [], "answers": []}}
            for pos in range(len(xdata)):
                context, question, answer = get_qca(xdata, ydata, pos)
                qa_set["data"]["question"].append(question)
                qa_set["data"]["context"].append(context)
                qa_set["data"]["answers"].append(
                    {"answer_start": [context.find(answer)], "text": [answer]}
                )
            return qa_set

        qa_train = gen_qa_set(self.xtrain, self.ytrain)
        qa_val = gen_qa_set(self.xeval, self.yeval)
        os.chdir(self.finetune_path)
        self._writeJSON(qa_train, "train" + end)
        self._writeJSON(qa_val, "eval" + end)

        # Rewrite each split back to plain 'question---context' / answer form.
        def add_sep(xdata, ydata):
            for pos in range(len(xdata)):
                context, question, answer = get_qca(xdata, ydata, pos)
                xdata[pos] = question + "---" + context
                ydata[pos] = answer

        add_sep(self.xtrain, self.ytrain)
        add_sep(self.xeval, self.yeval)
        add_sep(self.xtest, self.ytest)
        self.print_example(qa_train)

    # public functions
    def print_example(self, set):
        """Print the first three question/context/answer triples of ``set``."""
        for label, key in (("Questions:", "question"),
                           ("Context:", "context"),
                           ("Answers:", "answers")):
            print(label)
            print(set["data"][key][:3])
class DatasetGPT(Dataset):
    """Dataset specialization for causal language-model fine-tuning (GPT-style).

    Saves pairs unchanged and formats the splits into single long text
    files by interleaving x and y entries.
    """

    def __init__(self, home_path, data_path="./datasets/", split=0.9):
        # Resolve the language-modeling example directory as finetune_path.
        os.chdir(home_path)
        os.chdir("./transformers/examples/")
        os.chdir("./language-modeling")
        super().__init__(home_path, os.getcwd(), data_path, split)

    # protected functions
    def _save_data(self, x, y):
        """GPT needs no token wrapping; pass the pairs through untouched."""
        return x, y

    def _format_data(self, end):
        """Concatenate splits into 'train<end>.txt' / 'eval<end>.txt'."""
        # Interleave x[i] then y[i] into one long string; indexing (rather
        # than zip) preserves an IndexError on mismatched lengths.
        def flatten(xdata, ydata):
            print("The dataset has {} entries".format(len(xdata)))
            return "".join(xdata[pos] + ydata[pos] for pos in range(len(xdata)))

        train_str = flatten(self.xtrain, self.ytrain)
        eval_str = flatten(self.xeval, self.yeval)
        # save the training and evaluation files
        os.chdir(self.finetune_path)
        print("start saving files")
        self._writeTXT(train_str, "train" + end)
        self._writeTXT(eval_str, "eval" + end)
        print("saved files")
        # print an example
        self.print_example(train_str)

    # public functions
    def print_example(self, set):
        """Show the first 5000 characters of the concatenated corpus."""
        print(set[:5000])

Event Timeline