Page MenuHomec4science

load_files.py
No OneTemporary

File Metadata

Created
Sat, May 4, 21:43

load_files.py

import os
import json
from abc import abstractmethod
class Dataset:
    """Base class managing train/eval/test splits for fine-tuning datasets.

    Handles JSON/TXT persistence under ``data_path`` and the train/eval
    split. Subclasses must implement :meth:`_format_data`,
    :meth:`_save_data` and :meth:`print_example`.

    Note: methods navigate directories with ``os.chdir`` (process-wide
    side effect), mirroring the original design.
    """

    def __init__(self, home_path, finetune_path, data_path="./datasets/", split=0.9):
        # Splits are populated lazily by load_data().
        self.xtrain = None
        self.ytrain = None
        self.xeval = None
        self.yeval = None
        self.xtest = None
        self.ytest = None
        self.data_path = data_path
        # Directory the data was actually loaded from; set by load_data().
        self.loc_data_path = None
        self.finetune_path = finetune_path
        self.home_path = home_path
        # Fraction of the train file kept for training; the rest becomes eval.
        self.split = split
        # Special tokens: input/answer/find markers and the end-of-text token.
        self.keywords = ["<input>", "<answer>", "<find>", "<|endoftext|>"]

    # protected functions
    def _openJSON(self, dataset, end="", ext=".json"):
        """Load and return the JSON content of '<dataset><end><ext>'."""
        with open(dataset + end + ext, 'r') as fp:
            tmp = json.load(fp)
        return tmp

    def _writeTXT(self, data, name):
        """Write string ``data`` to '<name>.txt' in the current directory."""
        with open(name + ".txt", 'w') as f:
            f.write(data)

    def _writeJSON(self, data, name):
        """Serialize ``data`` as JSON to '<name>.json' in the current directory."""
        with open(name + ".json", "w") as f:
            json.dump(data, f)

    @abstractmethod
    def _format_data(self, end):
        """Transform loaded splits into the model-specific on-disk format."""
        # Fixed message: previously copy-pasted as "format_data" everywhere.
        raise NotImplementedError("_format_data has to be defined in each child class")

    @abstractmethod
    def _save_data(self, x, y):
        """Apply model-specific pre-processing to (x, y) before saving."""
        raise NotImplementedError("_save_data has to be defined in each child class")

    # Public methods
    def save_data(self, x, y, dir, end="", x_test=None, y_test=None, console=None):
        """Preprocess and persist train/test splits as JSON under data_path/dir.

        ``console`` (optional string) is additionally written to 'console<end>.txt'.
        Note: ``dir`` shadows the builtin but is kept for caller compatibility.
        """
        # None-sentinel instead of mutable [] defaults (shared-state pitfall).
        if x_test is None:
            x_test = []
        if y_test is None:
            y_test = []
        os.chdir(self.home_path)
        os.chdir(self.data_path)
        # makedirs(exist_ok=True) avoids the exists()/mkdir() race.
        os.makedirs(dir, exist_ok=True)
        os.chdir(dir)
        x, y = self._save_data(x, y)
        x_test, y_test = self._save_data(x_test, y_test)
        print("The train set has {} entries".format(len(x)))
        self._writeJSON(x, "x_train" + end)
        self._writeJSON(y, "y_train" + end)
        print("The test set has {} entries".format(len(x_test)))
        self._writeJSON(x_test, "x_test" + end)
        self._writeJSON(y_test, "y_test" + end)
        if console is not None:
            self._writeTXT(console, "console" + end)

    def load_data(self, end="", dir=None):
        """Load splits from data_path[/dir], carve out eval, then format them.

        The last (1 - split) fraction of the train file becomes the eval set.
        """
        os.chdir(self.home_path)
        os.chdir(self.data_path)
        if dir is not None:
            os.chdir("./" + dir + "/")
        self.loc_data_path = os.getcwd()
        self.xtest = self._openJSON('x_test', end=end)
        self.ytest = self._openJSON('y_test', end=end)
        self.xtrain = self._openJSON('x_train', end=end)
        self.ytrain = self._openJSON('y_train', end=end)
        # split train in evaluation and train
        idx_split = int(self.split * len(self.xtrain))
        self.xeval = self.xtrain[idx_split:]
        self.yeval = self.ytrain[idx_split:]
        self.xtrain = self.xtrain[:idx_split]
        self.ytrain = self.ytrain[:idx_split]
        os.chdir(self.finetune_path)
        self._format_data(end)

    def get_train(self):
        """Return [xtrain, ytrain]."""
        return [self.xtrain, self.ytrain]

    def get_eval(self):
        """Return [xeval, yeval]."""
        return [self.xeval, self.yeval]

    def get_test(self):
        """Return [xtest, ytest]."""
        return [self.xtest, self.ytest]

    @abstractmethod
    def print_example(self, set):
        """Print a small sample of ``set`` for inspection."""
        raise NotImplementedError("print_example has to be defined in each child class")
class DatasetBert(Dataset):
    """Dataset specialization for extractive QA fine-tuning (BERT-style).

    Wraps 'question---context' pairs with the special tokens on save, and
    unwraps them into SQuAD-like {question, context, answers} records on
    format.
    """

    def __init__(self, home_path, data_path="./datasets/", split=0.9):
        # Resolve the question-answering example directory as finetune_path.
        os.chdir(home_path)
        os.chdir("./transformers/examples/")
        os.chdir("./question-answering")
        super().__init__(home_path, os.getcwd(), data_path, split)

    # protected functions
    def _save_data(self, x, y):
        """Tokenize pairs in place: '<input>ctx<find>question<answer>' / 'ans<eot>\\n'."""
        tag_in, tag_ans, tag_find, tag_end = self.keywords
        for idx in range(len(x)):
            pieces = x[idx].split("---")
            # pieces[0] is the question, pieces[1] the context.
            x[idx] = tag_in + pieces[1] + tag_find + pieces[0] + tag_ans
        for idx in range(len(y)):
            y[idx] = y[idx] + tag_end + "\n"
        return x, y

    def _format_data(self, end):
        """Emit SQuAD-like train/eval JSON and strip tokens from the splits."""
        kw_in, kw_ans, kw_find, kw_end = self.keywords

        # Invert the wrapping done in _save_data for entry i_qca.
        def get_qca(xdata, ydata, i_qca):
            cut = xdata[i_qca].find(kw_find)
            context = xdata[i_qca][len(kw_in):cut]
            question = xdata[i_qca][cut + len(kw_find):-len(kw_ans)]
            answer = ydata[i_qca][:-len(kw_end) - 1]
            return context, question, answer

        # Build the {question, context, answers} columnar dictionary.
        def gen_qa_set(xdata, ydata):
            qa_set = {"data": {"question": [], "context": [], "answers": []}}
            for pos in range(len(xdata)):
                context, question, answer = get_qca(xdata, ydata, pos)
                qa_set["data"]["question"].append(question)
                qa_set["data"]["context"].append(context)
                qa_set["data"]["answers"].append(
                    {"answer_start": [context.find(answer)], "text": [answer]}
                )
            return qa_set

        qa_train = gen_qa_set(self.xtrain, self.ytrain)
        qa_val = gen_qa_set(self.xeval, self.yeval)
        os.chdir(self.finetune_path)
        self._writeJSON(qa_train, "train" + end)
        self._writeJSON(qa_val, "eval" + end)

        # Rewrite each split back to plain 'question---context' / answer form.
        def add_sep(xdata, ydata):
            for pos in range(len(xdata)):
                context, question, answer = get_qca(xdata, ydata, pos)
                xdata[pos] = question + "---" + context
                ydata[pos] = answer

        add_sep(self.xtrain, self.ytrain)
        add_sep(self.xeval, self.yeval)
        add_sep(self.xtest, self.ytest)
        self.print_example(qa_train)

    # public functions
    def print_example(self, set):
        """Print the first three question/context/answer triples of ``set``."""
        for label, key in (("Questions:", "question"),
                           ("Context:", "context"),
                           ("Answers:", "answers")):
            print(label)
            print(set["data"][key][:3])
class DatasetGPT(Dataset):
    """Dataset specialization for causal language-model fine-tuning (GPT-style).

    Saves pairs unchanged and formats the splits into single long text
    files by interleaving x and y entries.
    """

    def __init__(self, home_path, data_path="./datasets/", split=0.9):
        # Resolve the language-modeling example directory as finetune_path.
        os.chdir(home_path)
        os.chdir("./transformers/examples/")
        os.chdir("./language-modeling")
        super().__init__(home_path, os.getcwd(), data_path, split)

    # protected functions
    def _save_data(self, x, y):
        """GPT needs no token wrapping; pass the pairs through untouched."""
        return x, y

    def _format_data(self, end):
        """Concatenate splits into 'train<end>.txt' / 'eval<end>.txt'."""
        # Interleave x[i] then y[i] into one long string; indexing (rather
        # than zip) preserves an IndexError on mismatched lengths.
        def flatten(xdata, ydata):
            print("The dataset has {} entries".format(len(xdata)))
            return "".join(xdata[pos] + ydata[pos] for pos in range(len(xdata)))

        train_str = flatten(self.xtrain, self.ytrain)
        eval_str = flatten(self.xeval, self.yeval)
        # save the training and evaluation files
        os.chdir(self.finetune_path)
        print("start saving files")
        self._writeTXT(train_str, "train" + end)
        self._writeTXT(eval_str, "eval" + end)
        print("saved files")
        # print an example
        self.print_example(train_str)

    # public functions
    def print_example(self, set):
        """Show the first 5000 characters of the concatenated corpus."""
        print(set[:5000])

Event Timeline