diff --git a/main_project/gpt2_colab/load_files.py b/main_project/gpt2_colab/load_files.py
index 1131c5d..f778cda 100644
--- a/main_project/gpt2_colab/load_files.py
+++ b/main_project/gpt2_colab/load_files.py
@@ -1,179 +1,179 @@
import os
import json
from abc import abstractmethod
class Dataset:
def __init__(self, home_path, finetune_path, data_path="./datasets/", split=0.9):
self.xtrain = None
self.ytrain = None
self.xeval = None
self.yeval = None
self.xtest = None
self.ytest = None
self.data_path = data_path
self.loc_data_path = None
self.finetune_path = finetune_path
self.home_path = home_path
self.split = split
# Private methods
# protected functions
def _openJSON(self, dataset, end="", ext=".json"):
with open(dataset + end + ext, 'r') as fp:
tmp = json.load(fp)
return tmp
def _writeTXT(self, data, name):
with open(name + ".txt", 'w') as f:
f.write(data)
def _writeJSON(self, data, name):
with open(name + ".json", "w") as f:
json.dump(data, f)
@abstractmethod
def _format_data(self, end):
raise NotImplementedError("format_data has to be defined in each child class")
# Public methods
def load_data(self, end="", dir=None):
os.chdir(self.home_path)
os.chdir(self.data_path)
if dir is not None:
os.chdir("./" + dir + "/")
self.loc_data_path = os.getcwd()
self.xtest = self._openJSON('x_test', end=end)
self.ytest = self._openJSON('y_test', end=end)
self.xtrain = self._openJSON('x_train', end=end)
self.ytrain = self._openJSON('y_train', end=end)
# split train in evaluation and train
idx_split = int(self.split*len(self.xtrain))
self.xeval = self.xtrain[idx_split:]
self.yeval = self.ytrain[idx_split:]
self.xtrain = self.xtrain[:idx_split]
self.ytrain = self.ytrain[:idx_split]
self._format_data(end)
def get_train(self):
return [self.xtrain, self.ytrain]
def get_eval(self):
return [self.xeval, self.yeval]
def get_test(self):
return [self.xtest, self.ytest]
@abstractmethod
def print_example(self, data_set):
raise NotImplementedError("print_example has to be defined in each child class")
class DatasetBert(Dataset):
def __init__(self, home_path, data_path="./datasets/", split=0.9):
os.chdir(home_path)
os.chdir("./transformers/examples/")
os.chdir("./question-answering")
finetune_path = os.getcwd()
super().__init__(home_path, finetune_path, data_path, split)
# private functions
# protected functions
def _format_data(self, end):
# extract the context, question and answer from the x and y set
def get_qca(xdata, ydata, i_qca):
i_len = len("")
a_len = len("")
f_len = len("")
e_len = len("<|endoftext|>")
f_start = xdata[i_qca].find("")
context = xdata[i_qca][i_len:f_start]
question = xdata[i_qca][f_start + f_len:-a_len]
answer = ydata[i_qca][:-e_len - 1]
return context, question, answer
# generate a dictionary of questions
def gen_qa_set(xdata, ydata):
qa_set = {"data": {"question": [], "context": [], "answers": []}}
for i, _ in enumerate(xdata):
context, question, answer = get_qca(xdata, ydata, i)
qa_set["data"]["question"].append(question)
qa_set["data"]["context"].append(context)
qa_set["data"]["answers"].append({"answer_start": [context.find(answer)], "text": [answer]})
return qa_set
# generate the train and validation datasets
qa_train = gen_qa_set(self.xtrain, self.ytrain)
qa_val = gen_qa_set(self.xeval, self.yeval)
self._writeJSON(qa_train, "train" + end)
self._writeJSON(qa_val, "eval" + end)
+ # modify the set structure
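+ # rewrite each x as "question---context" and each y as the bare answer,
+ # the format that BertModel._generate later splits on "---"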
+ def add_sep(x, y):
+ for i, _ in enumerate(x):
+ q, c, a = get_qca(x, y, i)
+ x[i] = q + "---" + c
+ y[i] = a
+
+ add_sep(self.xtrain, self.ytrain)
+ add_sep(self.xeval, self.yeval)
+ add_sep(self.xtest, self.ytest)
+
self.print_example(qa_train)
# public functions
def print_example(self, data_set):
print("Questions:")
print(data_set["data"]["question"][:3])
print("Context:")
print(data_set["data"]["context"][:3])
print("Answers:")
print(data_set["data"]["answers"][:3])
class DatasetGPT(Dataset):
def __init__(self, home_path, data_path="./datasets/", split=0.9):
os.chdir(home_path)
os.chdir("./transformers/examples/")
os.chdir("./language-modeling")
finetune_path = os.getcwd()
super().__init__(home_path, finetune_path, data_path, split)
# private functions
# protected functions
- def _format_data(self, x_test, y_test, x_train, y_train, end):
+ def _format_data(self, end):
# combine the list entries to a single long string
comb_string = ["", ""] # train and eval set
for ind, data in enumerate([[self.xtrain, self.ytrain], [self.xeval, self.yeval]]):
for i, _ in enumerate(data[0]): # iterate over the samples, not over the [x, y] pair
comb_string[ind] += data[0][i] # from x set
comb_string[ind] += data[1][i] # from y set
# save the training and evaluation files
os.chdir(self.finetune_path)
self._writeTXT(comb_string[0], "train" + end)
self._writeTXT(comb_string[1], "eval" + end)
# print an example
self.print_example(comb_string[0])
- # modify the set structure
- def add_sep(x, y):
- for i, _ in enumerate(x):
- q, c, a = get_qca(x, y, i)
- x[i] = q + "---" + c
- y[i] = a
-
- add_sep(self.xtrain, self.ytrain)
- add_sep(self.xeval, self.yeval)
- add_sep(self.xtest, self.ytest)
-
# public functions
def print_example(self, data_set):
print(data_set[:5000])
\ No newline at end of file
diff --git a/main_project/gpt2_colab/main.py b/main_project/gpt2_colab/main.py
index 9e16cba..a336c4c 100644
--- a/main_project/gpt2_colab/main.py
+++ b/main_project/gpt2_colab/main.py
@@ -1,368 +1,387 @@
import os
import json
from transformers.generation_tf_utils import TFGenerationMixin
import numpy as np
import tensorflow as tf
import torch
import load_files as lf
import model as mo
# choose the model type
-MODEL = "bert" # gpt, bert
+MODEL = "gpt" # gpt, bert
# train the model from scratch or from a checkpoint
SCRATCH = False # True, False
# choose the checkpoint
-CHECKPOINT = "xlm-roberta-base" # xlm-roberta-base, gpt2, None
+CHECKPOINT = "gpt2" # xlm-roberta-base, gpt2, None
# train the model in this run
-TRAIN = True # True, False
+TRAIN = False # True, False
# number of training epochs
EPOCHS = 1
# new model name
-NAME = "xlmr-e1-test"
+NAME = "gpt_e_1" # xmlr_e_1_test,
home_path = os.getcwd()
# load a dataset
-if MODEL == "gpt2":
+if MODEL == "gpt":
print("Start instances for a gpt2 model")
dataset = lf.DatasetGPT(home_path)
- model = mo.GPTModel(home_path)
+ model = mo.GPTModel(home_path, printStep=-1)
elif MODEL == "bert":
print("Start instances for a bert model")
dataset = lf.DatasetBert(home_path)
- model = mo.BertModel(home_path)
+ model = mo.BertModel(home_path, printStep=-1)
else:
raise NotImplementedError("Model type not defined")
# load a dataset
dataset.load_data()
# train a tokenizer from scratch
if SCRATCH:
raise NotImplementedError("Train a tokenizer")
# once implemented, SCRATCH should point to the trained tokenizer ("loc of tok")
else:
SCRATCH = None
if TRAIN:
model.train(nbEpochs=EPOCHS,
outModelName=NAME,
startCheckpoint=CHECKPOINT,
dataEnd="",
tokenizerLocation=SCRATCH)
# load the model
model.load_model(NAME)
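+# evaluate the loaded model on the default test split (first 100 samples)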
+model.generate(dataset.get_test(), 0, 100)
+
+print("{} Test on CH dataset {}".format("="*100, "="*100))
+dataset.load_data(dir="ch_full")
+model.generate(dataset.get_test(), 0, 100)
+
+print("{} Test on FR dataset {}".format("="*100, "="*100))
+dataset.load_data(dir="fr_full_surname", end="_fr")
+model.generate(dataset.get_test(), 0, 100)
+
+print("{} Test on FI dataset {}".format("="*100, "="*100))
+dataset.load_data(dir="fi_full_surname", end="_fi")
+model.generate(dataset.get_test(), 0, 100)
+
+print("{} Test on FI dataset with Finnish names {}".format("="*100, "="*100))
+dataset.load_data(dir="fi_full_surname_fi_names", end="_fi")
+model.generate(dataset.get_test(), 0, 100)
+
+
"""
# modify the Hugging Face code
TFGenerationMixin._generate_no_beam_search = mod_gpt._generate_no_beam_search_modif
TFGenerationMixin.generate = mod_gpt.generate_modif
def calc_perf(lim, is_lim, corr_answer, given_answer, corr2):
for i, l in enumerate(lim):
# calculate true positive, etc.
if is_lim > l:
# confidence over threshold
if corr_answer == given_answer:
corr2[i,0] += 1
else:
corr2[i,1] += 1
else:
if corr_answer == given_answer:
corr2[i,2] += 1
else:
corr2[i,3] += 1
return corr2
def print_perf(corr2, lim, console_out, samples):
print(samples)
print(corr2[0])
# true positive etc. table
tmp = "lim - true pos - false pos - false neg - true neg\n"
for i, l in enumerate(lim):
tmp += str(l) + " - "
for v in corr2[i]:
tmp += str(int(100 / samples * v)) + "% - "
# tmp -= "- "
tmp += "\n"
print(tmp)
console_out += tmp + "\n"
# F1, recall and precision table
tmp = "lim - F1 - recall - precision \n"
for i, l in enumerate(lim):
tmp += str(l) + " - "
recall = corr2[i,0] / (corr2[i,0] + corr2[i,2])
precision = corr2[i,0] / (corr2[i,0] + corr2[i,1])
F1 = 2* precision * recall / (precision + recall)
tmp += str(F1) + " - "
tmp += str(recall) + " - "
tmp += str(precision)
tmp += "\n"
print(tmp)
console_out += tmp + "\n"
return console_out
def score(x, y, name="model_small", step=10, stop=100, console_name=""):
console_out = ""
samples = 0
samples_c = 0
samples_pc = 0
x_sure = []
y_sure = []
x_unsure = []
y_unsure = []
eps = 0.01
lim = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
count = [eps, eps, eps, eps, eps, eps, eps, eps, eps]
corr = [0, 0, 0, 0, 0, 0, 0, 0, 0]
corr2 = np.zeros((len(lim), 4))
if MODEL == "gpt2":
model = mo.get_model(model_path, name)
if SCRATCH:
tokenizer = tok.get_tok(tok_path)
else:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
for ind, start_ in enumerate(x):
if ind > stop:
break
input_ids = tokenizer.encode(start_, return_tensors='tf')
generated_text_samples = model.generate(
input_ids,
max_length=len(input_ids[0]) + 50,
num_return_sequences=1,
no_repeat_ngram_size=0,
repetition_penalty=1.0,
top_p=1.0,
temperature=1.0,
do_sample=False,
top_k=0,
early_stopping=True,
tokenizer=tokenizer,
VERBOSE=VERBOSE
)
corr_answer = y[ind][:-len("<|endoftext|>") - 1]
given_answer = generated_text_samples[0]
# answer correct with high proba
if generated_text_samples[1] > 0.8:
x_sure.append(start_)
y_sure.append(given_answer + "<|endoftext|>")
else:
x_unsure.append(start_)
y_unsure.append("<|endoftext|>")
os.chdir(curr_data_path)
for data_set in ["train", "test"]:
with open("x_"+data_set+"_sure.json", 'w') as fp:
json.dump(x_sure, fp)
with open("y_"+data_set+"_sure.json", 'w') as fp:
json.dump(y_sure, fp)
with open("x_"+data_set+"_unsure.json", 'w') as fp:
json.dump(x_unsure, fp)
with open("y_"+data_set+"_unsure.json", 'w') as fp:
json.dump(y_unsure, fp)
samples += 1
for i, l in enumerate(lim):
if generated_text_samples[1] > l:
if count[i] > eps:
count[i] += 1
else:
count[i] = 1
corr2 = calc_perf(lim, generated_text_samples[1], corr_answer, given_answer, corr2)
if corr_answer == given_answer:
samples_c += 1
samples_pc += 1
for i, l in enumerate(lim):
if generated_text_samples[1] > l:
corr[i] += 1
elif given_answer in corr_answer:
samples_pc += 1
if ind % step == 0 or ind == stop:
tmp="Score: {}, {}, {}".format(samples, samples_c, samples_pc)
print(tmp)
console_out += tmp + "\n"
tmp="lim : {} \n prob: {} \n used: {}".format(lim,
[int(100 / count[i] * corr[i]) for i, _ in enumerate(lim)],
[int(100 / samples * count[i]) for i, _ in enumerate(lim)])
print(tmp)
console_out += tmp + "\n"
tmp = "{} / {} ".format(corr_answer, given_answer)
print(tmp)
console_out += tmp + "\n"
console_out = print_perf(corr2, lim, console_out, samples)
os.chdir(curr_data_path)
with open("console_out" + console_name + ".txt", 'w') as f:
f.write(console_out)
# create a new dataset with good answers
return samples, samples_c, samples_pc
def score_xmlr(x, y, name="model_small", step=10, stop=100, console_name=""):
console_out = ""
samples = 0
samples_c = 0
samples_pc = 0
eps = 0.01
lim = [x/10 for x in range(10)]
# lim = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
# count = np.zeros((len(lim),))
# count += eps
# count = [eps, eps, eps, eps, eps, eps, eps, eps, eps]
corr2 = np.zeros((len(lim), 4))
# tmp = [0, 0, 0, 0]
# corr2 = [tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp]
if MODEL == "xmlr":
model, tokenizer = mox.get_model(model_path, name)
for ind, start_ in enumerate(x):
if ind > stop:
break
tmp = x[ind].split("---")
text = tmp[1]
question = tmp[0]
corr_answer = y[ind]
inputs = tokenizer(question, text, return_tensors='pt')
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]
outputs = model(**inputs)
loss = outputs.loss
start_scores = outputs.start_logits
end_scores = outputs.end_logits
a = torch.argmax(start_scores)
b = torch.argmax(end_scores)
a = int(a)
b = int(b)
probs_a = tf.nn.softmax(start_scores.detach())
probs_b = tf.nn.softmax(end_scores.detach())
prob_a = probs_a[0, a]
prob_b = probs_b[0, b]
prob_ab = prob_a*prob_b
print("a = {} with {} %, b = {} with {} %, combo {}".format(a, prob_a, b, prob_b, prob_ab))
print("correct answer: " + corr_answer)
given_answer = tokenizer.decode(inputs['input_ids'][0][int(a):int(b) + 1])
print("given answer: " + given_answer)
samples += 1
corr2 = calc_perf(lim, prob_ab, corr_answer, given_answer, corr2)
if given_answer[0] == " ":
given_answer = given_answer[1:]
if corr_answer == given_answer:
samples_c += 1
samples_pc += 1
elif given_answer in corr_answer:
samples_pc += 1
if ind % step == 0 or ind == stop:
tmp = "Score: {}, {}, {}".format(samples, samples_c, samples_pc)
print(tmp)
console_out += tmp + "\n"
tmp = "{} / {} ".format(corr_answer, given_answer)
print(tmp)
console_out += tmp + "\n"
console_out = print_perf(corr2, lim, console_out, samples)
os.chdir(curr_data_path)
with open("console_out" + console_name + ".txt", 'w') as f:
f.write(console_out)
# create a new dataset with good answers
return samples, samples_c, samples_pc
if False:
score_xmlr(x_test, y_test, name=name, step=10, stop=100)
if True:
VERBOSE = "nothing_but_score"
score(x_test, y_test, name=name, step=10, stop=100)
if False:
x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, dir="ch_full")
score(x_test, y_test, model=name, step=1, stop=1)
x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, end="_fr", dir="fr_full_surname")
score(x_test, y_test, model=name, step=1, stop=1)
x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, end="_fi", dir="fi_full_surname")
score(x_test, y_test, name=name, step=5, stop=100, console_name="_on_ch")
name = "gpt_e_1_adapt"
start = "gpt_e_1"
_, _, _, _, _ = lf.load_data(home_path, gpt_path, end="_sure", dir="fi_full_surname")
os.chdir(gpt_path)
mo.train(model_path, 2, name, end="_sure", start=start, tok_loc = tok_loc)
score(x_test[100:], y_test[100:], name=name, step=5, stop=100, console_name="_on_ch_fi")
name = "gpt_e_1_adapt2"
start = "gpt_e_1_adapt"
_, _, _, _, _ = lf.load_data(home_path, gpt_path, end="_sure", dir="fi_full_surname")
os.chdir(gpt_path)
mo.train(model_path, 2, name, end="_sure", start=start, tok_loc = tok_loc)
score(x_test[200:], y_test[200:], name=name, step=5, stop=100, console_name="_on_ch_fi_fi")
"""
\ No newline at end of file
diff --git a/main_project/gpt2_colab/model.py b/main_project/gpt2_colab/model.py
index c441efa..b35be0f 100644
--- a/main_project/gpt2_colab/model.py
+++ b/main_project/gpt2_colab/model.py
@@ -1,121 +1,358 @@
import os
from transformers import TFGPT2LMHeadModel
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from abc import abstractmethod
+import numpy as np
+import tensorflow as tf
+import torch
+
+from transformers.generation_tf_utils import TFGenerationMixin
+import modif_gpt as mod_gpt
+
+
class Model:
- def __init__(self, home_path, finetune_path, model_path="./models/"):
+ def __init__(self, home_path, finetune_path, model_path, printStep, buckets):
+ os.chdir(home_path)
+ os.chdir(model_path)
+ model_path = os.getcwd()
+
self.home_path = home_path
self.finetune_path = finetune_path
self.model_path = model_path
self.model = None
self.tokenizer = None
+ self.buckets = buckets
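+ # confidence thresholds 0, 1/buckets, ..., (buckets-1)/buckets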
+ self.lim = [x/self.buckets for x in range(self.buckets)]
+ self.count = np.zeros((self.buckets,)) + 1e-9 # epsilon avoids division by zero when a bucket stays empty
+ self.score = {
+ "tp": np.zeros((self.buckets,)),
+ "fp": np.zeros((self.buckets,)),
+ "fn": np.zeros((self.buckets,)),
+ "tn": np.zeros((self.buckets,)),
+
+ "f1": np.zeros((self.buckets,)),
+ "recall": np.zeros((self.buckets,)),
+ "precision": np.zeros((self.buckets,)),
+ }
+
+ self.printStep = printStep
+
+ self.sureLim = 0.8
+ self.xsure = []
+ self.ysure = []
+
# private functions
+ def __printScores(self, hist=True):
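+ # console-table helpers: std_len pads a value to a fixed width; nbTolines
+ # renders nb1 relative to nb2 as a bar of '|' characters padded to 100 columns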
+ def nbTolines(nb1, nb2):
+ return std_len("|"*int(100/nb2*nb1), l=100, side="right", fill=".")
+
+ def std_len(s, l=8, side="left", fill=" "):
+ s = str(s)
+ while len(s) < l:
+ if side == "left":
+ s = fill + s
+ else:
+ s += fill
+ return s
+
+ def doHist(tmp, var):
+ if i == self.buckets - 1:
+ next_bin = 0
+ else:
+ next_bin = var[i + 1]
+ tmp += " - {}".format(
+ nbTolines(var[i]-next_bin, var[0])
+ )
+ return tmp
+
+ print("Number of exact matches, for each ConfidenceScore threshold")
+ print("trs - used cases [%] - ConfidenceScore distribution")
+ tmp = ""
+ for i in range(self.buckets):
+ tmp += "{} - {}%".format(
+ std_len(self.lim[i]),
+ std_len(int(100/self.count[0]*self.count[i]))
+ )
+ if hist:
+ tmp = doHist(tmp, self.count)
+ tmp += "\n"
+
+ print(tmp)
+ print("\n {} \n".format("="*100))
+
+ print("trs - true positive [%] - false positive [%] - false negative [%] - true negative [%]")
+ tmp = ""
+ for i in range(self.buckets):
+ tmp += "{} - {}% - {}% - {}% - {}%\n".format(
+ std_len(self.lim[i]),
+ std_len(int(100/self.count[i]*self.score["tp"][i])),
+ std_len(int(100 / self.count[i] * self.score["fp"][i])),
+ std_len(int(100 / self.count[i] * self.score["fn"][i])),
+ std_len(int(100 / self.count[i] * self.score["tn"][i]))
+ )
+ print(tmp)
+ print("\n {} \n".format("="*100))
+
+ print("trs - recall - precision - F1 - F1 histogram")
+ tmp = ""
+ for i in range(self.buckets):
+ tmp += "{} - {} - {} - {}".format(
+ std_len(self.lim[i], 2),
+ std_len(round(self.score["recall"][i], 2)),
+ std_len(round(self.score["precision"][i], 2)),
+ std_len(round(self.score["f1"][i], 2))
+ )
+ if hist:
+ tmp += " - " + nbTolines(self.score["f1"][i], 1)
+ tmp += "\n"
+
+ print(tmp)
+ print("\n {} \n".format("=" * 100))
+
+ def __updateF1(self):
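+ # recall = tp/(tp+fn), precision = tp/(tp+fp), F1 = their harmonic mean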
+ for i in range(self.buckets):
+ self.score["recall"][i] = \
+ self.score["tp"][i]/(self.score["tp"][i] + self.score["fn"][i])
+ self.score["precision"][i] = \
+ self.score["tp"][i]/(self.score["tp"][i] + self.score["fp"][i])
+
+ self.score["f1"][i] = \
+ 2 * self.score["precision"][i]*self.score["recall"][i] / \
+ (self.score["precision"][i] + self.score["recall"][i])
+
+ def __check_answer(self, confidenceScore, corrAnswer, givenAnswer):
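+ # for every threshold: above it the model commits to its answer
+ # (tp if correct, fp if wrong); below it the answer counts as
+ # abstained (fn if it would have been correct, tn otherwise)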
+ for i, l in enumerate(self.lim):
+ # calculate true positive, etc.
+ if confidenceScore > l:
+ self.count[i] += 1
+
+ if corrAnswer == givenAnswer:
+ self.score["tp"][i] += 1
+ else:
+ self.score["fp"][i] += 1
+ else:
+ if corrAnswer == givenAnswer:
+ self.score["fn"][i] += 1
+ else:
+ self.score["tn"][i] += 1
+
+ # update recall, precision and F1 score
+ self.__updateF1()
# protected functions
@abstractmethod
def _train(self, nbEpochs, outModelName, startCheckpoint, dataEnd, tokenizerLocation):
raise NotImplementedError("_train has to be defined in each child class")
+ @abstractmethod
+ def _generate(self, x):
+ raise NotImplementedError("_generate has to be defined in each child class")
+
+
# public functions
+ def getSureGuesses(self):
+ return [self.xsure, self.ysure]
+
def train(self, nbEpochs, outModelName, startCheckpoint=None, dataEnd="", tokenizerLocaction=None):
print("train ...")
if startCheckpoint is None:
print("... from scratch, with the Tokenizer in {}".format(tokenizerLocation))
+ raise NotImplementedError("Training from Scratch is not implemented yet")
else:
tokenizerLocation = startCheckpoint
cmd = self._train(nbEpochs, outModelName, startCheckpoint, dataEnd, tokenizerLocation)
print(cmd)
os.chdir(self.finetune_path)
os.system(cmd)
+ def generate(self, data, idxStart=0, idxEnd=None):
+ if idxEnd is None:
+ idxEnd = len(data[0])
+ X = data[0][idxStart:idxEnd]
+ Y = data[1][idxStart:idxEnd]
+
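+ # answer every sample, compare it with the reference answer and keep
+ # high-confidence guesses for later reuse (see getSureGuesses)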
+ for i, x in enumerate(X):
+ givenAnswer, confScore = self._generate(x)
+ corrAnswer = Y[i]
+
+ self.__check_answer(confScore, corrAnswer, givenAnswer)
+
+ if confScore > self.sureLim:
+ self.xsure.append(x)
+ self.ysure.append(Y[i])
+
+ if self.printStep > 0 and (i+1) % self.printStep == 0:
+ print("({}): Correct Answer / given Answer \n{} / {}".format(
+ i, corrAnswer, givenAnswer
+ ))
+ self.__printScores()
+
+ print("Final scores")
+ self.__printScores()
+
@abstractmethod
def load_model(self, ModelName):
raise NotImplementedError("load_model has to be defined in each child class")
class GPTModel(Model):
- def __init__(self, home_path, model_path="./models/"):
+ def __init__(self, home_path, model_path="./models", probaMode = "longOk", printStep=10, buckets=10):
os.chdir(home_path)
os.chdir("./transformers/examples/")
os.chdir("./language-modeling")
finetune_path = os.getcwd()
- super().__init__(home_path, finetune_path, model_path)
+ super().__init__(home_path, finetune_path, model_path, printStep, buckets)
+
+ self.probaMode = probaMode
def _train(self, nbEpochs, outModelName, startCheckpoint, dataEnd, tokenizerLocation):
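# fine-tuning is delegated to Hugging Face's run_clm.py example script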
cmd = "python run_clm.py \
--model_type {} \
--train_file \"{}\" \
--tokenizer_name {} \
--do_train \
--validation_file \"{}\" \
--do_eval \
--per_gpu_train_batch_size 1 \
--save_steps -1 \
--num_train_epochs \"{}\" \
--fp16 \
--output_dir=\"{}\" \
".format(
"gpt2",
"train" + dataEnd + ".txt",
tokenizerLocation,
"eval" + dataEnd + ".txt",
nbEpochs,
self.model_path + "/" + outModelName)
if startCheckpoint is not None:
if startCheckpoint not in ["gpt2"]:
startCheckpoint = self.model_path + "/" + startCheckpoint
cmd += " --model_name_or_path {}".format(startCheckpoint)
return cmd
+ def __set_modif_gpt(self):
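+ # monkey-patch the Hugging Face TF generation loop with the modified
+ # version from modif_gpt, which returns [answer, confidence] instead of token ids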
+ TFGenerationMixin._generate_no_beam_search = mod_gpt._generate_no_beam_search_modif
+ TFGenerationMixin.generate = mod_gpt.generate_modif
+
+ def _generate(self, x):
+ self.__set_modif_gpt()
+ input_ids = self.tokenizer.encode(x, return_tensors='tf')
+
+ VERBOSE = "nothing_but_score"
+ VERBOSE = "nothing"
+
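+ # greedy decoding (do_sample=False, top_k=0, top_p=1.0); the patched
+ # generate returns [best_answer, confidence_score]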
+ generated_text_samples = self.model.generate(
+ input_ids,
+ max_length=len(input_ids[0]) + 50,
+ num_return_sequences=1,
+ no_repeat_ngram_size=0,
+ repetition_penalty=1.0,
+ top_p=1.0,
+ temperature=1.0,
+ do_sample=False,
+ top_k=0,
+ early_stopping=True,
+ tokenizer=self.tokenizer,
+ VERBOSE=VERBOSE,
+ probaMode=self.probaMode
+ )
+
+ givenAnswer = generated_text_samples[0]
+
+ return givenAnswer, generated_text_samples[1]
+
+ def set_proba_mode(self, mode):
+ if mode in ["mult", "longOk"]:
+ self.probaMode = mode
+ else:
+ raise NotImplementedError("This probability mode is not yet implemented")
+
def load_model(self, ModelName):
- self.model = TFGPT2LMHeadModel.from_pretrained(model_path + "/" + name, from_pt=True)
- self.tokenizer = AutoTokenizer.from_pretrained(model_path + "/" + name)
+ self.model = TFGPT2LMHeadModel.from_pretrained(self.model_path + "/" + ModelName, from_pt=True)
+ self.tokenizer = AutoTokenizer.from_pretrained(self.model_path + "/" + ModelName)
class BertModel(Model):
- def __init__(self, home_path, model_path="./models/"):
+ def __init__(self, home_path, model_path="./models", printStep=10, buckets=10):
os.chdir(home_path)
os.chdir("./transformers/examples/")
os.chdir("./question-answering")
finetune_path = os.getcwd()
- super().__init__(home_path, finetune_path, model_path)
+ super().__init__(home_path, finetune_path, model_path, printStep, buckets)
# private functions
def _train(self, nbEpochs, outModelName, startCheckpoint, dataEnd, tokenizerLocation):
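# fine-tuning is delegated to Hugging Face's run_qa.py example script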
cmd = "python run_qa.py \
--train_file \"{}\" \
--do_train \
--num_train_epochs \"{}\" \
--output_dir=\"{}\" \
--fp16 \
--save_steps -1 \
".format(
"train" + dataEnd + ".json",
nbEpochs,
self.model_path + "/" + outModelName)
# --validation_file \"{}\" \
# --do_eval \
# "eval" + end + ".json"
if startCheckpoint is not None:
if startCheckpoint not in ["xlm-roberta-base", "roberta-base"]:
startCheckpoint = self.model_path + "/" + startCheckpoint
cmd += " --model_name_or_path=\"{}\"".format(startCheckpoint)
return cmd
+ def _generate(self, x):
+ tmp = x.split("---")
+ text = tmp[1]
+ question = tmp[0]
+
+ # tokenize model input
+ inputs = self.tokenizer(question, text, return_tensors='pt')
+
+ # generate network output
+ outputs = self.model(**inputs)
+ start_scores = outputs.start_logits
+ end_scores = outputs.end_logits
+
+ # find start and end of the answer
+ a = torch.argmax(start_scores)
+ b = torch.argmax(end_scores)
+ a = int(a)
+ b = int(b)
+
+ # get the probability of the answer
+ probs_a = tf.nn.softmax(start_scores.detach())
+ probs_b = tf.nn.softmax(end_scores.detach())
+
+ prob_a = probs_a[0, a]
+ prob_b = probs_b[0, b]
+ prob_ab = prob_a * prob_b
+
+ givenAnswer = self.tokenizer.decode(inputs['input_ids'][0][int(a):int(b) + 1])
+
+ # the tokenizer often emits a leading space that is not part of the answer
+ if givenAnswer.startswith(" "):
+ givenAnswer = givenAnswer[1:]
+
+ return givenAnswer, prob_ab
+
# public functions
def load_model(self, ModelName):
self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_path+"/"+ModelName)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_path + "/" + ModelName)
\ No newline at end of file
diff --git a/main_project/gpt2_colab/modif_gpt.py b/main_project/gpt2_colab/modif_gpt.py
index 9c0aa24..3d90ad5 100644
--- a/main_project/gpt2_colab/modif_gpt.py
+++ b/main_project/gpt2_colab/modif_gpt.py
@@ -1,571 +1,558 @@
import matplotlib.pyplot as plt
import transformers.generation_tf_utils as ge
import numpy as np
import tensorflow as tf
def generate_modif(
self,
input_ids=None,
max_length=None,
min_length=None,
do_sample=None,
early_stopping=None,
num_beams=None,
temperature=None,
top_k=None,
top_p=None,
repetition_penalty=None,
bad_words_ids=None,
bos_token_id=None,
pad_token_id=None,
eos_token_id=None,
length_penalty=None,
no_repeat_ngram_size=None,
num_return_sequences=None,
attention_mask=None,
decoder_start_token_id=None,
use_cache=None,
forced_bos_token_id=None,
forced_eos_token_id=None,
tokenizer=None,
VERBOSE=None,
+ probaMode=None,
):
# We cannot generate if the model does not have a LM head
if self.get_output_embeddings() is None:
raise AttributeError(
"You tried to generate sequences with a model that does not have a LM Head."
"Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)"
)
max_length = max_length if max_length is not None else self.config.max_length
min_length = min_length if min_length is not None else self.config.min_length
do_sample = do_sample if do_sample is not None else self.config.do_sample
early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
num_beams = num_beams if num_beams is not None else self.config.num_beams
temperature = temperature if temperature is not None else self.config.temperature
top_k = top_k if top_k is not None else self.config.top_k
top_p = top_p if top_p is not None else self.config.top_p
repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty
bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
no_repeat_ngram_size = (
no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size
)
bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids
num_return_sequences = (
num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
)
decoder_start_token_id = (
decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id
)
forced_bos_token_id = (
forced_bos_token_id if forced_bos_token_id is not None else self.config.forced_bos_token_id
)
forced_eos_token_id = (
forced_eos_token_id if forced_eos_token_id is not None else self.config.forced_eos_token_id
)
if input_ids is not None:
batch_size = ge.shape_list(input_ids)[0] # overridden by the input batch_size
else:
batch_size = 1
assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
assert temperature > 0, "`temperature` should be strictly positive."
assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
assert input_ids is not None or (
isinstance(bos_token_id, int) and bos_token_id >= 0
), "If input_ids is not defined, `bos_token_id` should be a positive integer."
assert pad_token_id is None or (
isinstance(pad_token_id, int) and (pad_token_id >= 0)
), "`pad_token_id` should be a positive integer."
assert (eos_token_id is None) or (
isinstance(eos_token_id, int) and (eos_token_id >= 0)
), "`eos_token_id` should be a positive integer."
assert length_penalty > 0, "`length_penalty` should be strictly positive."
assert (
isinstance(num_return_sequences, int) and num_return_sequences > 0
), "`num_return_sequences` should be a strictly positive integer."
assert (
bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list)
), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated"
if input_ids is None:
assert isinstance(bos_token_id, int) and bos_token_id >= 0, (
"you should either supply a context to complete as `input_ids` input "
"or a `bos_token_id` (integer >= 0) as a first token to start the generation."
)
input_ids = tf.fill((batch_size, 1), bos_token_id)
else:
assert len(ge.shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)."
# not allow to duplicate outputs when greedy decoding
if do_sample is False:
if num_beams == 1:
# no_beam_search greedy generation conditions
assert (
num_return_sequences == 1
), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1"
else:
# beam_search greedy generation conditions
assert (
num_beams >= num_return_sequences
), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences"
# create attention mask if necessary
# TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140
if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()):
attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32)
elif attention_mask is None:
attention_mask = tf.ones_like(input_ids)
if pad_token_id is None and eos_token_id is not None:
ge.logger.warning(
"Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id)
)
pad_token_id = eos_token_id
# current position and vocab size
cur_len = ge.shape_list(input_ids)[1] # unused
vocab_size = self.config.vocab_size
# set effective batch size and effective batch multiplier according to do_sample
if do_sample:
effective_batch_size = batch_size * num_return_sequences
effective_batch_mult = num_return_sequences
else:
effective_batch_size = batch_size
effective_batch_mult = 1
if self.config.is_encoder_decoder:
if decoder_start_token_id is None:
decoder_start_token_id = bos_token_id
assert (
decoder_start_token_id is not None
), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation"
assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self)
assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder)
# get encoder and store encoder outputs
encoder = self.get_encoder()
encoder_outputs = encoder(input_ids, attention_mask=attention_mask)
# Expand input ids if num_beams > 1 or num_return_sequences > 1
if num_return_sequences > 1 or num_beams > 1:
input_ids_len = ge.shape_list(input_ids)[-1]
input_ids = tf.broadcast_to(
tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len)
)
attention_mask = tf.broadcast_to(
tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len)
)
input_ids = tf.reshape(
input_ids, (effective_batch_size * num_beams, input_ids_len)
) # shape: (batch_size * num_return_sequences * num_beams, cur_len)
attention_mask = tf.reshape(
attention_mask, (effective_batch_size * num_beams, input_ids_len)
) # shape: (batch_size * num_return_sequences * num_beams, cur_len)
if self.config.is_encoder_decoder:
# create empty decoder_input_ids
input_ids = (
tf.ones(
(effective_batch_size * num_beams, 1),
dtype=tf.int32,
)
* decoder_start_token_id
)
cur_len = 1
assert (
batch_size == encoder_outputs[0].shape[0]
), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} "
# expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1)
expanded_batch_idxs = tf.reshape(
tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1),
shape=(-1,),
)
# expand encoder_outputs
encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0),)
else:
encoder_outputs = None
cur_len = ge.shape_list(input_ids)[-1]
assert (
cur_len < max_length
), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`"
- if num_beams > 1:
- output = self._generate_beam_search(
- input_ids,
- cur_len=cur_len,
- max_length=max_length,
- min_length=min_length,
- do_sample=do_sample,
- early_stopping=early_stopping,
- temperature=temperature,
- top_k=top_k,
- top_p=top_p,
- repetition_penalty=repetition_penalty,
- no_repeat_ngram_size=no_repeat_ngram_size,
- bad_words_ids=bad_words_ids,
- pad_token_id=pad_token_id,
- eos_token_id=eos_token_id,
- batch_size=effective_batch_size,
- num_return_sequences=num_return_sequences,
- length_penalty=length_penalty,
- num_beams=num_beams,
- vocab_size=vocab_size,
- encoder_outputs=encoder_outputs,
- attention_mask=attention_mask,
- use_cache=use_cache,
- forced_bos_token_id=forced_bos_token_id,
- forced_eos_token_id=forced_eos_token_id,
- )
- else:
- output = self._generate_no_beam_search(
- input_ids,
- cur_len=cur_len,
- max_length=max_length,
- min_length=min_length,
- do_sample=do_sample,
- temperature=temperature,
- top_k=top_k,
- top_p=top_p,
- repetition_penalty=repetition_penalty,
- no_repeat_ngram_size=no_repeat_ngram_size,
- bad_words_ids=bad_words_ids,
- pad_token_id=pad_token_id,
- eos_token_id=eos_token_id,
- batch_size=effective_batch_size,
- vocab_size=vocab_size,
- encoder_outputs=encoder_outputs,
- attention_mask=attention_mask,
- use_cache=use_cache,
- tokenizer=tokenizer,
- VERBOSE=VERBOSE,
- )
+ output = self._generate_no_beam_search(
+ input_ids,
+ cur_len=cur_len,
+ max_length=max_length,
+ min_length=min_length,
+ do_sample=do_sample,
+ temperature=temperature,
+ top_k=top_k,
+ top_p=top_p,
+ repetition_penalty=repetition_penalty,
+ no_repeat_ngram_size=no_repeat_ngram_size,
+ bad_words_ids=bad_words_ids,
+ pad_token_id=pad_token_id,
+ eos_token_id=eos_token_id,
+ batch_size=effective_batch_size,
+ vocab_size=vocab_size,
+ encoder_outputs=encoder_outputs,
+ attention_mask=attention_mask,
+ use_cache=use_cache,
+ tokenizer=tokenizer,
+ VERBOSE=VERBOSE,
+ probaMode=probaMode
+ )
return output
def do_hist(x, title=""):
if False: # histogram plotting disabled; set to True for debugging
# get the number of values at or below -1000 (assume these are -inf)
res = tf.math.count_nonzero(tf.greater_equal(-1000, x))
x = x[tf.math.is_finite(x)]
# x = tf.where(x > - 500, x, -500)
num_bins = 20
# the histogram of the data
n, bins, patches = plt.hist(x, num_bins, facecolor='blue')
plt.xlabel('assigned proba')
plt.ylabel('occurrence')
plt.title(r'{}: logits distribution, # negligible tokens = {}'.format(title, res))
# Tweak spacing to prevent clipping of ylabel
plt.subplots_adjust(left=0.15)
plt.show()
+def getNewProba(probaMode, oldValue, nextValue, nb_gen=None):
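+ # "longOk" keeps a running geometric mean of the per-token probabilities:
+ # after n tokens the score is (p_1 * ... * p_n)^(1/n), updated incrementally
+ # via old^((n-1)/n) * next^(1/n), so long answers are not penalised;
+ # "mult" is the plain product of the per-token probabilities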
+ if probaMode == "longOk":
+ newValue = float(np.power(
+ oldValue,
+ (nb_gen - 1.0) / nb_gen) * np.power(nextValue, 1.0 / nb_gen)
+ )
+ return newValue
+
+ elif probaMode == "mult":
+ return oldValue*nextValue
+
+ else:
+ NotImplementedError("This probability mode is not yet implemented")
+
print("def generate no beam search modif")
def _generate_no_beam_search_modif(
self,
input_ids,
cur_len,
max_length,
min_length,
do_sample,
temperature,
top_k,
top_p,
repetition_penalty,
no_repeat_ngram_size,
bad_words_ids,
pad_token_id,
eos_token_id,
batch_size,
vocab_size,
encoder_outputs,
attention_mask,
use_cache,
tokenizer,
VERBOSE,
+ probaMode,
**kwargs
):
"""
Generate sequences for each example without beam search (num_beams == 1). All returned sequences are generated
independently.
"""
answers = []
scores = []
forbidden_first_token = []
context = tokenizer.decode(input_ids[0])
if VERBOSE == True:
print("context is : {}".format(context))
for beam_nb in range(3):
answer = ""
FIRST = True
EOS_1st = True
nb_gen = 0
output_score = 1
abort = False # set when this beam gives up searching
# length of generated sentences / unfinished sentences
unfinished_sents = tf.ones_like(input_ids[:, 0])
sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length
past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models
while cur_len < max_length:
nb_gen += 1
model_inputs = self.prepare_inputs_for_generation(
input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **kwargs
)
outputs = self(**model_inputs)
next_token_logits = outputs[0][:, -1, :]
do_hist(next_token_logits[0], "init")
# if model has past, then set the past variable to speed up decoding
if self._use_cache(outputs, use_cache):
past = outputs[1]
# keep it to test stability, but to get best performance don't use it
# repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)
if repetition_penalty != 1.0:
next_token_logits_penalties = ge._create_next_token_logits_penalties(
input_ids, next_token_logits, repetition_penalty
)
next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties)
logits_copy = tf.identity(next_token_logits)
nb_fails = 0
while True:
# restore the logits if the loop runs several times
next_token_logits = tf.identity(logits_copy)
if do_sample:
# Temperature (higher temperature => more likely to sample low probability tokens)
if temperature != 1.0:
next_token_logits = next_token_logits / temperature
do_hist(next_token_logits[0], "temp")
# Top-p/top-k filtering
print("topk = {}, top_p = {}".format(top_k, top_p))
next_token_logits = ge.tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
do_hist(next_token_logits[0], "top p/k")
# Sample
next_token = tf.squeeze(
tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1
)
else:
# Greedy decoding
next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32)
# is next token eos or in the sequence?
if VERBOSE == True:
print("The next token is {}".format(tokenizer.decode([int(next_token)])))
tmp_answer = answer + tokenizer.decode([int(next_token)])
forceReuse = True # debug switch: set to False to accept the token without the context check
if not forceReuse:
print("reuse not enforced")
break
remove_token = False
# if end is detected
if int(next_token) == eos_token_id:
if EOS_1st:
EOS_1st = False
probs = tf.nn.softmax(next_token_logits[0])
next_score = probs[int(next_token)]
if answer not in answers:
answers.append(answer)
if answers[-1] not in ["", "-"]:
- scores.append(float(
- np.power(output_score, (nb_gen - 1.0) / nb_gen) * np.power(next_score,
- 1.0 / nb_gen)))
+ scores.append(getNewProba(probaMode, output_score, next_score, nb_gen=nb_gen))
else:
scores.append(0.0)
if VERBOSE == True:
print("One result would be : {}, with proba {}".format(answers[-1], scores[-1]))
remove_token = True
else:
if answer not in answers:
answers.append(answer)
if answers[-1] not in ["", "-"]:
scores.append(float(output_score))
else:
scores.append(0.0)
if VERBOSE == True:
print("Another result would be : {}, with proba {}".format(answers[-1],
scores[-1]))
answer = tmp_answer
break
# if probability is low (ignoring current token), or only forbidden tokens are proposed
if output_score < 0.05 or nb_fails > 100:
if answer not in answers:
answers.append(answer)
if answers[-1] not in ["", "-"]:
scores.append(float(output_score))
else:
scores.append(0.0)
abort = True # give up on this beam
if True: # VERBOSE == True:
print(
"No point to continue searching in ... : {}, with proba {}".format(answers[-1],
scores[-1]))
break
# when predicting the first token, don't start multiple times at the same place
if FIRST:
FIRST = False
if tmp_answer[0] in forbidden_first_token:
if "nothing" not in VERBOSE:
print("it started with the same letter")
remove_token = True
else:
forbidden_first_token.append(tmp_answer[0])
# check that the token is in the input and previous conditions are fulfilled (they would set remove_token to True)
if tmp_answer in context and not remove_token:
answer = tmp_answer
break
# remove that token and search something else
else:
# remove that token as possibility
nb_fails += 1
if "nothing" not in VERBOSE:
print("this way the answer would not be in the context")
print("remove token {}".format(int(next_token)))
tmp = [[False for i in range(logits_copy.shape[1])]]
tmp[0][int(next_token)] = True
logits_copy = ge.set_tensor_by_indices_to_value(
logits_copy, tf.convert_to_tensor(tmp, dtype=tf.bool), -float("inf")
)
probs = tf.nn.softmax(next_token_logits[0])
next_score = probs[int(next_token)]
- output_score = np.power(output_score, (nb_gen - 1.0) / nb_gen) * np.power(next_score, 1.0 / nb_gen)
+ output_score = getNewProba(probaMode, output_score, next_score, nb_gen=nb_gen)
if VERBOSE == True:
print("output_score = {}, next_score = {}".format(output_score, next_score))
# print("The banned tokens are \n {}".format(
# [tokenizer.decode([x]) for x in banned_tokens[0]]))
# update generations and finished sentences
if eos_token_id is not None:
# pad finished sentences if eos_token_id exist
tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents)
else:
tokens_to_add = next_token
# add token and increase length by one
input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1)
cur_len = cur_len + 1
if eos_token_id is not None:
eos_in_sents = tokens_to_add == eos_token_id
# if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length
is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply(
unfinished_sents, tf.cast(eos_in_sents, tf.int32)
)
sent_lengths = (
sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos)
+ cur_len * is_sents_unfinished_and_token_to_add_is_eos
)
# unfinished_sents is set to zero if eos in sentence
unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos
# stop when there is an EOS token in each sentence, or if we exceed the maximum length
if tf.math.reduce_max(unfinished_sents) == 0 or abort:
break
# extend attention_mask for new generated input if only decoder
if self.config.is_encoder_decoder is False:
attention_mask = tf.concat(
[attention_mask, tf.ones((ge.shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1
)
# if there are different sentences lengths in the batch, some batches have to be padded
min_sent_length = tf.math.reduce_min(sent_lengths)
max_sent_length = tf.math.reduce_max(sent_lengths)
if min_sent_length != max_sent_length:
assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths"
# finished sents are filled with pad_token
padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id
# create length masks for tf.where operation
broad_casted_sent_lengths = tf.broadcast_to(
tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length]
)
broad_casted_range = tf.transpose(
tf.broadcast_to(tf.expand_dims(tf.range(max_sent_length), -1), [max_sent_length, batch_size])
)
decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding)
else:
decoded = input_ids
if "score" in VERBOSE:
print(answers)
print(scores)
if "" in answers:
print(" -- ")
if '' in answers:
print(" ---- ")
# find most likely option
if len(scores) > 0:
max_ind = scores.index(max(scores))
return [answers[max_ind], scores[max_ind]]
else:
return ["None", 0]
return decoded
print("def calc banned ...")
def calc_banned_ngram_tokens_modif(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len):
# Copied from fairseq for no_repeat_ngram in beam_search
if cur_len + 1 < no_repeat_ngram_size:
# return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
return [[] for _ in range(num_hypos)]
generated_ngrams = [{} for _ in range(num_hypos)]
for idx in range(num_hypos):
gen_tokens = prev_input_ids[idx].numpy().tolist()
generated_ngram = generated_ngrams[idx]
for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]):
prev_ngram_tuple = tuple(ngram[:-1])
generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]
def _get_generated_ngrams(hypo_idx):
# Before decoding the next token, prevent decoding of ngrams that have already appeared
start_idx = cur_len + 1 - no_repeat_ngram_size
ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist())
print("ngram_idx: {}".format(ngram_idx))
return generated_ngrams[hypo_idx].get(ngram_idx, [])
banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]
return banned_tokens
\ No newline at end of file