diff --git a/main_project/gpt2_colab/load_files.py b/main_project/gpt2_colab/load_files.py index 1131c5d..f778cda 100644 --- a/main_project/gpt2_colab/load_files.py +++ b/main_project/gpt2_colab/load_files.py @@ -1,179 +1,179 @@ import os import json from abc import abstractmethod class Dataset: def __init__(self, home_path, finetune_path, data_path="./datasets/", split=0.9): self.xtrain = None self.ytrain = None self.xeval = None self.yeval = None self.xtest = None self.ytest = None self.data_path = data_path self.loc_data_path = None self.finetune_path = finetune_path self.home_path = home_path self.split = split # Private methods # protected functions def _openJSON(self, dataset, end="", ext=".json"): with open(dataset + end + ext, 'r') as fp: tmp = json.load(fp) return tmp def _writeTXT(self, data, name): with open(name + ".txt", 'w') as f: f.write(data) def _writeJSON(self, data, name): with open(name + ".json", "w") as f: json.dump(data, f) @abstractmethod def _format_data(self, end): raise NotImplementedError("format_data has to be defined in each child class") # Public methods def load_data(self, end="", dir=None): os.chdir(self.home_path) os.chdir(self.data_path) if dir is not None: os.chdir("./" + dir + "/") self.loc_data_path = os.getcwd() self.xtest = self._openJSON('x_test', end=end) self.ytest = self._openJSON('y_test', end=end) self.xtrain = self._openJSON('x_train', end=end) self.ytrain = self._openJSON('y_train', end=end) # split train in evaluation and train idx_split = int(self.split*len(self.xtrain)) self.xeval = self.xtrain[idx_split:] self.yeval = self.ytrain[idx_split:] self.xtrain = self.xtrain[:idx_split] self.ytrain = self.ytrain[:idx_split] self._format_data(end) def get_train(self): return [self.xtrain, self.ytrain] def get_eval(self): return [self.xeval, self.yeval] def get_test(self): return [self.xtest, self.ytest] @abstractmethod def print_example(self, set): raise NotImplementedError("format_data has to be defined in each child class") class DatasetBert(Dataset): def __init__(self, home_path, data_path="./datasets/", split=0.9): os.chdir(home_path) os.chdir("./transformers/examples/") os.chdir("./question-answering") finetune_path = os.getcwd() super().__init__(home_path, finetune_path, data_path, split) # private functions # protected functions def _format_data(self, end): # extract the context, question and answer from the x and y set def get_qca(xdata, ydata, i_qca): i_len = len("") a_len = len("") f_len = len("") e_len = len("<|endoftext|>") f_start = xdata[i_qca].find("") context = xdata[i_qca][i_len:f_start] question = xdata[i_qca][f_start + f_len:-a_len] answer = ydata[i_qca][:-e_len - 1] return context, question, answer # generate a dictionary of questions def gen_qa_set(xdata, ydata): qa_set = {"data": {"question": [], "context": [], "answers": []}} for i, _ in enumerate(xdata): context, question, answer = get_qca(xdata, ydata, i) qa_set["data"]["question"].append(question) qa_set["data"]["context"].append(context) qa_set["data"]["answers"].append({"answer_start": [context.find(answer)], "text": [answer]}) return qa_set # generate the train and validation datasets qa_train = gen_qa_set(self.xtrain, self.ytrain) qa_val = gen_qa_set(self.xeval, self.yeval) self._writeJSON(qa_train, "train" + end) self._writeJSON(qa_val, "eval" + end) + # modify the set structure + def add_sep(x, y): + for i, _ in enumerate(x): + q, c, a = get_qca(x, y, i) + x[i] = q + "---" + c + y[i] = a + + add_sep(self.xtrain, self.ytrain) + add_sep(self.xeval, self.yeval) + 
add_sep(self.xtest, self.ytest) + self.print_example(qa_train) # public functions def print_example(self, set): print("Questions:") print(set["data"]["question"][:3]) print("Context:") print(set["data"]["context"][:3]) print("Answers:") print(set["data"]["answers"][:3]) class DatasetGPT(Dataset): def __init__(self, home_path, data_path="./datasets/", split=0.9): os.chdir(home_path) os.chdir("./transformers/examples/") os.chdir("./language-modeling") finetune_path = os.getcwd() super().__init__(home_path, finetune_path, data_path, split) # private functions # protected functions - def _format_data(self, x_test, y_test, x_train, y_train, end): + def _format_data(self, end): # combine the list entries to a single long string comb_string = ["", ""] # train and eval set for ind, data in enumerate([[self.xtrain, self.ytrain], [self.xeval, self.yeval]]): for i, _ in enumerate(data): comb_string[ind] += data[0][i] # from x set comb_string[ind] += data[1][i] # from y set # save the training and evaluation files os.chdir(self.finetune_path) self._writeTXT(comb_string[0], "train" + end) self._writeTXT(comb_string[1], "eval" + end) # print an example self.print_example(comb_string[0]) - # modify the set structure - def add_sep(x, y): - for i, _ in enumerate(x): - q, c, a = get_qca(x, y, i) - x[i] = q + "---" + c - y[i] = a - - add_sep(self.xtrain, self.ytrain) - add_sep(self.xeval, self.yeval) - add_sep(self.xtest, self.ytest) - # public functions def print_example(self, set): print(set[:5000]) \ No newline at end of file diff --git a/main_project/gpt2_colab/main.py b/main_project/gpt2_colab/main.py index 9e16cba..a336c4c 100644 --- a/main_project/gpt2_colab/main.py +++ b/main_project/gpt2_colab/main.py @@ -1,368 +1,387 @@ import os import json from transformers.generation_tf_utils import TFGenerationMixin import numpy as np import tensorflow as tf import torch import load_files as lf import model as mo # choose the model type -MODEL = "bert" # gpt, bert +MODEL = "gpt" # gpt, bert # train the model from scratch or from a checkpoint SCRATCH = False # True, False # choose the checkpoint -CHECKPOINT = "xlm-roberta-base" # xlm-roberta-base, gpt2, None +CHECKPOINT = "gpt2" # xlm-roberta-base, gpt2, None # train the model in this run -TRAIN = True # True, False +TRAIN = False # True, False # number of training epochs EPOCHS = 1 # new model name -NAME = "xlmr-e1-test" +NAME = "gpt_e_1" # xmlr_e_1_test, home_path = os.getcwd() # load a dataset -if MODEL == "gpt2": +if MODEL == "gpt": print("Start instances for a gpt2 model") dataset = lf.DatasetGPT(home_path) - model = mo.GPTModel(home_path) + model = mo.GPTModel(home_path, printStep=-1) elif MODEL == "bert": print("Start instances for a bert model") dataset = lf.DatasetBert(home_path) - model = mo.BertModel(home_path) + model = mo.BertModel(home_path, printStep=-1) else: NotImplementedError("Model type not defined") # load a dataset dataset.load_data() # train a tokenizer from scratch if SCRATCH: NotImplementedError("Train a tokenizer") SCRATCH = "loc of tok" else: SCRATCH = None if TRAIN: model.train(nbEpochs=EPOCHS, outModelName=NAME, startCheckpoint=CHECKPOINT, dataEnd="", tokenizerLocaction=SCRATCH) # load the model model.load_model(NAME) +model.generate(dataset.get_test(), 0, 100) + +print("{} Test on CH dataset {}".format("="*100, "="*100)) +dataset.load_data(dir="ch_full") +model.generate(dataset.get_test(), 0, 100) + +print("{} Test on FR dataset {}".format("="*100, "="*100)) +dataset.load_data(dir="fr_full_surname", end="_fr") 
+model.generate(dataset.get_test(), 0, 100) + +print("{} Test on FI dataset {}".format("="*100, "="*100)) +dataset.load_data(dir="fi_full_surname", end="_fi") +model.generate(dataset.get_test(), 0, 100) + +print("{} Test on FI dataset with Finnish names {}".format("="*100, "="*100)) +dataset.load_data(dir="fi_full_surname_fi_names", end="_fi") +model.generate(dataset.get_test(), 0, 100) + + """ # modify the hugginface code TFGenerationMixin._generate_no_beam_search = mod_gpt._generate_no_beam_search_modif TFGenerationMixin.generate = mod_gpt.generate_modif def calc_perf(lim, is_lim, corr_answer, given_answer, corr2): for i, l in enumerate(lim): # calculate true positive, ect if is_lim > l: # confidence over treshold if corr_answer == given_answer: corr2[i,0] += 1 else: corr2[i,1] += 1 else: if corr_answer == given_answer: corr2[i,2] += 1 else: corr2[i,3] += 1 return corr2 def print_perf(corr2, lim, console_out, samples): print(samples) print(corr2[0]) # true positive ect table tmp = "lim - true pos - false pos - false neg - true neg\n" for i, l in enumerate(lim): tmp += str(l) + " - " for v in corr2[i]: tmp += str(int(100 / samples * v)) + "% - " # tmp -= "- " tmp += "\n" print(tmp) console_out += tmp + "\n" # true positive ect table tmp = "lim - F1 - recall - precision \n" for i, l in enumerate(lim): tmp += str(l) + " - " recall = corr2[i,0] / (corr2[i,0] + corr2[i,2]) precision = corr2[i,0] / (corr2[i,0] + corr2[i,1]) F1 = 2* precision * recall / (precision + recall) tmp += str(F1) + " - " tmp += str(recall) + " - " tmp += str(precision) tmp += "\n" print(tmp) console_out += tmp + "\n" return console_out def score(x, y, name="model_small", step=10, stop=100, console_name=""): console_out = "" samples = 0 samples_c = 0 samples_pc = 0 x_sure = [] y_sure = [] x_unsure = [] y_unsure = [] eps = 0.01 lim = [.1, .2, .3, .4, .5, .6, .7, .8, .9] count = [eps, eps, eps, eps, eps, eps, eps, eps, eps] corr = [0, 0, 0, 0, 0, 0, 0, 0, 0] corr2 = np.zeros((len(lim), 4)) if MODEL == "gpt2": model = mo.get_model(model_path, name) if SCRATCH: tokenizer = tok.get_tok(tok_path) else: tokenizer = GPT2Tokenizer.from_pretrained("gpt2") for ind, start_ in enumerate(x): if ind > stop: break input_ids = tokenizer.encode(start_, return_tensors='tf') generated_text_samples = model.generate( input_ids, max_length=len(input_ids[0]) + 50, num_return_sequences=1, no_repeat_ngram_size=0, repetition_penalty=1.0, top_p=1.0, temperature=1.0, do_sample=False, top_k=0, early_stopping=True, tokenizer=tokenizer, VERBOSE=VERBOSE ) corr_answer = y[ind][:-len("<|endoftext|>") - 1] given_answer = generated_text_samples[0] # answer correct with high proba if generated_text_samples[1] > 0.8: x_sure.append(start_) y_sure.append(given_answer + "<|endoftext|>") else: x_unsure.append(start_) y_unsure.append("<|endoftext|>") os.chdir(curr_data_path) for data_set in ["train", "test"]: with open("x_"+data_set+"_sure.json", 'w') as fp: json.dump(x_sure, fp) with open("y_"+data_set+"_sure.json", 'w') as fp: json.dump(y_sure, fp) with open("x_"+data_set+"_unsure.json", 'w') as fp: json.dump(x_unsure, fp) with open("y_"+data_set+"_unsure.json", 'w') as fp: json.dump(y_unsure, fp) samples += 1 for i, l in enumerate(lim): if generated_text_samples[1] > l: if count[i] > eps: count[i] += 1 else: count[i] = 1 corr2 = calc_perf(lim, generated_text_samples[1], corr_answer, given_answer, corr2) if corr_answer == given_answer: samples_c += 1 samples_pc += 1 for i, l in enumerate(lim): if generated_text_samples[1] > l: corr[i] += 1 elif given_answer in 
corr_answer: samples_pc += 1 if ind % step == 0 or ind == stop: tmp="Score: {}, {}, {}".format(samples, samples_c, samples_pc) print(tmp) console_out += tmp + "\n" tmp="lim : {} \n prob: {} \n used: {}".format(lim, [int(100 / count[i] * corr[i]) for i, _ in enumerate(lim)], [int(100 / samples * count[i]) for i, _ in enumerate(lim)]) print(tmp) console_out += tmp + "\n" tmp = "{} / {} ".format(corr_answer, given_answer) print(tmp) console_out += tmp + "\n" console_out = print_perf(corr2, lim, console_out, samples) os.chdir(curr_data_path) with open("console_out" + console_name + ".txt", 'w') as f: f.write(console_out) # create a new dataset with good answers return samples, samples_c, samples_pc def score_xmlr(x, y, name="model_small", step=10, stop=100, console_name=""): console_out = "" samples = 0 samples_c = 0 samples_pc = 0 eps = 0.01 lim = [x/10 for x in range(10)] # lim = [.1, .2, .3, .4, .5, .6, .7, .8, .9] # count = np.zeros((len(lim),)) # count += eps # count = [eps, eps, eps, eps, eps, eps, eps, eps, eps] corr2 = np.zeros((len(lim), 4)) # tmp = [0, 0, 0, 0] # corr2 = [tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp] if MODEL == "xmlr": model, tokenizer = mox.get_model(model_path, name) for ind, start_ in enumerate(x): if ind > stop: break tmp = x[ind].split("---") text = tmp[1] question = tmp[0] corr_answer = y[ind] inputs = tokenizer(question, text, return_tensors='pt') input_ids = inputs["input_ids"] attention_mask = inputs["attention_mask"] outputs = model(**inputs) loss = outputs.loss start_scores = outputs.start_logits end_scores = outputs.end_logits a = torch.argmax(start_scores) b = torch.argmax(end_scores) a = int(a) b = int(b) probs_a = tf.nn.softmax(start_scores.detach()) probs_b = tf.nn.softmax(end_scores.detach()) prob_a = probs_a[0, a] prob_b = probs_b[0, b] prob_ab = prob_a*prob_b print("a = {} with {} %, b = {} with {} %, combo {}".format(a, prob_a, b, prob_b, prob_ab)) # answer => 'a nice puppet' print("correct answer: " + corr_answer) given_answer = tokenizer.decode(inputs['input_ids'][0][int(a):int(b) + 1]) print("given answer: " + given_answer) samples += 1 corr2 = calc_perf(lim, prob_ab, corr_answer, given_answer, corr2) if given_answer[0] == " ": given_answer = given_answer[1:] if corr_answer == given_answer: samples_c += 1 samples_pc += 1 elif given_answer in corr_answer: samples_pc += 1 if ind % step == 0 or ind == stop: tmp = "Score: {}, {}, {}".format(samples, samples_c, samples_pc) print(tmp) console_out += tmp + "\n" tmp = "{} / {} ".format(corr_answer, given_answer) print(tmp) console_out += tmp + "\n" console_out = print_perf(corr2, lim, console_out, samples) os.chdir(curr_data_path) with open("console_out" + console_name + ".txt", 'w') as f: f.write(console_out) # create a new dataset with good answers return samples, samples_c, samples_pc if False: score_xmlr(x_test, y_test, name=name, step=10, stop=100) if True: VERBOSE = "nothing_but_score" score(x_test, y_test, name=name, step=10, stop=100) if False: x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, dir="ch_full") score(x_test, y_test, model=name, step=1, stop=1) x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, end="_fr", dir="fr_full_surname") score(x_test, y_test, model=name, step=1, stop=1) x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, end="_fi", dir="fi_full_surname") score(x_test, y_test, name=name, step=5, stop=100, console_name="_on_ch") name = "gpt_e_1_adapt" start = "gpt_e_1" _, _, 
_, _, _ = lf.load_data(home_path, gpt_path, end="_sure", dir="fi_full_surname") os.chdir(gpt_path) mo.train(model_path, 2, name, end="_sure", start=start, tok_loc = tok_loc) score(x_test[100:], y_test[100:], name=name, step=5, stop=100, console_name="_on_ch_fi") name = "gpt_e_1_adapt2" start = "gpt_e_1_adapt" _, _, _, _, _ = lf.load_data(home_path, gpt_path, end="_sure", dir="fi_full_surname") os.chdir(gpt_path) mo.train(model_path, 2, name, end="_sure", start=start, tok_loc = tok_loc) score(x_test[200:], y_test[200:], name=name, step=5, stop=100, console_name="_on_ch_fi_fi") """ \ No newline at end of file diff --git a/main_project/gpt2_colab/model.py b/main_project/gpt2_colab/model.py index c441efa..b35be0f 100644 --- a/main_project/gpt2_colab/model.py +++ b/main_project/gpt2_colab/model.py @@ -1,121 +1,358 @@ import os from transformers import TFGPT2LMHeadModel from transformers import AutoTokenizer, AutoModelForQuestionAnswering from abc import abstractmethod +import numpy as np +import tensorflow as tf +import torch + +from transformers.generation_tf_utils import TFGenerationMixin +import modif_gpt as mod_gpt + + class Model: - def __init__(self, home_path, finetune_path, model_path="./models/"): + def __init__(self, home_path, finetune_path, model_path, printStep, buckets): + os.chdir(home_path) + os.chdir(model_path) + model_path = os.getcwd() + + self.home_path = home_path self.home_path = home_path self.finetune_path = finetune_path self.model_path = model_path self.model = None self.tokenizer = None + self.buckets = buckets + self.lim = [x/self.buckets for x in range(self.buckets)] + self.count = np.zeros((self.buckets,)) + self.score = { + "tp": np.zeros((self.buckets,)), + "fp": np.zeros((self.buckets,)), + "fn": np.zeros((self.buckets,)), + "tn": np.zeros((self.buckets,)), + + "f1": np.zeros((self.buckets,)), + "recall": np.zeros((self.buckets,)), + "precision": np.zeros((self.buckets,)), + } + + self.printStep = printStep + + self.sureLim = 0.8 + self.xsure = [] + self.ysure = [] + # private functions + def __printScores(self, hist=True): + def nbTolines(nb1, nb2): + return std_len("|"*int(100/nb2*nb1), l=100, side="right", fill=".") + + def std_len(s, l=8, side="left", fill=" "): + s = str(s) + while(len(s) < l): + if side == "left": + s = fill + s + else: + s += fill + return s + + def doHist(tmp, var): + if i == self.buckets - 1: + next_bin = 0 + else: + next_bin = var[i + 1] + tmp += " - {}".format( + nbTolines(var[i]-next_bin, var[0]) + ) + return tmp + + print("Number of exact matches, for each ConfidenceScore threshold") + print("trs - used cases [%] - ConfidenceScore distribution") + tmp = "" + for i in range(self.buckets): + tmp += "{} - {}%".format( + std_len(self.lim[i]), + std_len(int(100/self.count[0]*self.count[i])) + ) + if hist: + tmp = doHist(tmp, self.count) + tmp += "\n" + + print(tmp) + print("\n {} \n".format("="*100)) + + print("trs - true positive [%] - false positive [%] - false negative [%] - true negative [%]") + tmp = "" + for i in range(self.buckets): + tmp += "{} - {}% - {}% - {}% - {}%\n".format( + std_len(self.lim[i]), + std_len(int(100/self.count[i]*self.score["tp"][i])), + std_len(int(100 / self.count[i] * self.score["fp"][i])), + std_len(int(100 / self.count[i] * self.score["fn"][i])), + std_len(int(100 / self.count[i] * self.score["tn"][i])) + ) + print(tmp) + print("\n {} \n".format("="*100)) + + print("trs - recall - precision - F1 - F1 histogram") + tmp = "" + for i in range(self.buckets): + tmp += "{} - {} - {} - {}".format( + 
std_len(self.lim[i], 2), + std_len(round(self.score["recall"][i], 2)), + std_len(round(self.score["precision"][i], 2)), + std_len(round(self.score["f1"][i], 2)) + ) + if hist: + tmp += " - " + nbTolines(self.score["f1"][i], 1) + tmp += "\n" + + print(tmp) + print("\n {} \n".format("=" * 100)) + + def __updateF1(self): + for i in range(self.buckets): + self.score["recall"][i] = \ + self.score["tp"][i]/(self.score["tp"][i] + self.score["fn"][i]) + self.score["precision"][i] = \ + self.score["tp"][i]/(self.score["tp"][i] + self.score["fp"][i]) + + self.score["f1"][i] = \ + 2 * self.score["precision"][i]*self.score["recall"][i] / \ + (self.score["precision"][i] + self.score["recall"][i]) + + def __check_answer(self, confidenceScore, corrAnswer, givenAnswer): + for i, l in enumerate(self.lim): + # calculate true positive, ect + if confidenceScore > l: + self.count[i] += 1 + + if corrAnswer == givenAnswer: + self.score["tp"][i] += 1 + else: + self.score["fp"][i] += 1 + else: + if corrAnswer == givenAnswer: + self.score["fn"][i] += 1 + else: + self.score["tn"][i] += 1 + + # update recall, precission and F1 score + self.__updateF1() # protected functions @abstractmethod def _train(self, nbEpochs, outModelName, startCheckpoint, dataEnd, tokenizerLocaction): raise NotImplementedError("__train has to be defined in each child class") + @abstractmethod + def _generate(self, x): + raise NotImplementedError("_generate has to be defined in each child class") + + # public functions + def getSureGuesses(self): + return [self.xsure, self.ysure] + def train(self, nbEpochs, outModelName, startCheckpoint=None, dataEnd="", tokenizerLocaction=None): print("train ...") if startCheckpoint == None: print("... from scratch, with the Tokenizer in {}".format(tokenizerLocaction)) + raise NotImplementedError("Training from Scratch is not implemented yet") else: tokenizerLocaction = startCheckpoint cmd = self._train(nbEpochs, outModelName, startCheckpoint, dataEnd, tokenizerLocaction) print(cmd) os.chdir(self.finetune_path) os.system(cmd) + def generate(self, data, idxStart=0, idx_End=None): + if idx_End is None: + idx_End = len(data[0]) + X = data[0][idxStart:idx_End] + Y = data[1][idxStart:idx_End] + + for i, x in enumerate(X): + givenAnswer, confScore = self._generate(x) + corrAnswer = Y[i] + + self.__check_answer(confScore, corrAnswer, givenAnswer) + + if confScore > self.sureLim: + self.xsure.append(x) + self.ysure.append(Y[i]) + + if self.printStep > 0 and (i+1) % self.printStep == 0: + print("({}): Correct Answer / given Answer \n{} / {}".format( + i, corrAnswer, givenAnswer + )) + self.__printScores() + + print("Final scores") + self.__printScores() + @abstractmethod def load_model(self, ModelName): raise NotImplementedError("load_model has to be defined in each child class") class GPTModel(Model): - def __init__(self, home_path, model_path="./models/"): + def __init__(self, home_path, model_path="./models", probaMode = "longOk", printStep=10, buckets=10): os.chdir(home_path) os.chdir("./transformers/examples/") os.chdir("./language-modeling") finetune_path = os.getcwd() - super().__init__(home_path, finetune_path, model_path) + super().__init__(home_path, finetune_path, model_path, printStep, buckets) + + self.probaMode = probaMode def _train(self, nbEpochs, outModelName, startCheckpoint, dataEnd, tokenizerLocaction): cmd = "python run_clm.py \ --model_type {} \ --train_file \"{}\" \ --tokenizer_name {} \ --do_train \ --validation_file \"{}\" \ --do_eval \ --per_gpu_train_batch_size 1 \ --save_steps -1 \ 
--num_train_epochs \"{}\" \ --fp16 \ --output_dir=\"{}\" \ ".format( "gpt2", "train" + dataEnd + ".txt", tokenizerLocaction, "eval" + dataEnd + ".txt", nbEpochs, self.model_path + "/" + outModelName) if startCheckpoint is not None: if startCheckpoint not in ["gpt2"]: startCheckpoint = self.model_path + "/" + startCheckpoint cmd += " --model_name_or_path {}".format(startCheckpoint) return cmd + def __set_modif_gpt(self): + TFGenerationMixin._generate_no_beam_search = mod_gpt._generate_no_beam_search_modif + TFGenerationMixin.generate = mod_gpt.generate_modif + + def _generate(self, x): + self.__set_modif_gpt() + input_ids = self.tokenizer.encode(x, return_tensors='tf') + + VERBOSE = "nothing_but_score" + VERBOSE = "nothing" + + generated_text_samples = self.model.generate( + input_ids, + max_length=len(input_ids[0]) + 50, + num_return_sequences=1, + no_repeat_ngram_size=0, + repetition_penalty=1.0, + top_p=1.0, + temperature=1.0, + do_sample=False, + top_k=0, + early_stopping=True, + tokenizer=self.tokenizer, + VERBOSE=VERBOSE, + probaMode=self.probaMode + ) + + givenAnswer = generated_text_samples[0] + + return givenAnswer, generated_text_samples[1] + + def set_proba_mode(self, mode): + if mode in ["mult", "longOk"]: + self.probaMode = mode + else: + raise NotImplementedError("This probability mode is not yet implemented") + def load_model(self, ModelName): - self.model = TFGPT2LMHeadModel.from_pretrained(model_path + "/" + name, from_pt=True) - self.tokenizer = AutoTokenizer.from_pretrained(model_path + "/" + name) + self.model = TFGPT2LMHeadModel.from_pretrained(self.model_path + "/" + ModelName, from_pt=True) + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path + "/" + ModelName) class BertModel(Model): - def __init__(self, home_path, model_path="./models/"): + def __init__(self, home_path, model_path="./models", printStep=10, buckets=10): os.chdir(home_path) os.chdir("./transformers/examples/") os.chdir("./question-answering") finetune_path = os.getcwd() - super().__init__(home_path, finetune_path, model_path) + super().__init__(home_path, finetune_path, model_path, printStep, buckets) # private functions def _train(self, nbEpochs, outModelName, startCheckpoint, dataEnd, tokenizerLocaction): cmd = "python run_qa.py \ --train_file \"{}\" \ --do_train \ --num_train_epochs \"{}\" \ --output_dir=\"{}\" \ --fp16 \ --save_steps -1 \ ".format( "train" + dataEnd + ".json", nbEpochs, self.model_path + "/" + outModelName) # --validation_file \"{}\" \ # --do_eval \ # "eval" + end + ".json" if startCheckpoint is not None: if startCheckpoint not in ["xlm-roberta-base", "roberta-base"]: startCheckpoint = self.model_path + "/" + startCheckpoint cmd += " --model_name_or_path=\"{}\"".format(startCheckpoint) return cmd + def _generate(self, x): + tmp = x.split("---") + text = tmp[1] + question = tmp[0] + + # tokenize model input + inputs = self.tokenizer(question, text, return_tensors='pt') + + # generate network output + outputs = self.model(**inputs) + start_scores = outputs.start_logits + end_scores = outputs.end_logits + + # find start and end of the answer + a = torch.argmax(start_scores) + b = torch.argmax(end_scores) + a = int(a) + b = int(b) + + # get the probability of the answer + probs_a = tf.nn.softmax(start_scores.detach()) + probs_b = tf.nn.softmax(end_scores.detach()) + + prob_a = probs_a[0, a] + prob_b = probs_b[0, b] + prob_ab = prob_a * prob_b + + givenAnswer = self.tokenizer.decode(inputs['input_ids'][0][int(a):int(b) + 1]) + + # due to the tokenizer the answer often 
starts with a blank space, which is not part of the answer + if len(givenAnswer) > 0 and givenAnswer[0] == " ": + givenAnswer = givenAnswer[1:] + + return givenAnswer, prob_ab + # public functions def load_model(self, ModelName): self.model = AutoModelForQuestionAnswering.from_pretrained(self.model_path+"/"+ModelName) self.tokenizer = AutoTokenizer.from_pretrained(self.model_path + "/" + ModelName) \ No newline at end of file diff --git a/main_project/gpt2_colab/modif_gpt.py b/main_project/gpt2_colab/modif_gpt.py index 9c0aa24..3d90ad5 100644 --- a/main_project/gpt2_colab/modif_gpt.py +++ b/main_project/gpt2_colab/modif_gpt.py @@ -1,571 +1,558 @@ import matplotlib.pyplot as plt import transformers.generation_tf_utils as ge import numpy as np import tensorflow as tf def generate_modif( self, input_ids=None, max_length=None, min_length=None, do_sample=None, early_stopping=None, num_beams=None, temperature=None, top_k=None, top_p=None, repetition_penalty=None, bad_words_ids=None, bos_token_id=None, pad_token_id=None, eos_token_id=None, length_penalty=None, no_repeat_ngram_size=None, num_return_sequences=None, attention_mask=None, decoder_start_token_id=None, use_cache=None, forced_bos_token_id=None, forced_eos_token_id=None, tokenizer=None, VERBOSE=None, + probaMode=None, ): # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: raise AttributeError( "You tried to generate sequences with a model that does not have a LM Head." "Please use another model class (e.g. `TFOpenAIGPTLMHeadModel`, `TFXLNetLMHeadModel`, `TFGPT2LMHeadModel`, `TFCTRLLMHeadModel`, `TFT5ForConditionalGeneration`, `TFTransfoXLLMHeadModel`)" ) max_length = max_length if max_length is not None else self.config.max_length min_length = min_length if min_length is not None else self.config.min_length do_sample = do_sample if do_sample is not None else self.config.do_sample early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping num_beams = num_beams if num_beams is not None else self.config.num_beams temperature = temperature if temperature is not None else self.config.temperature top_k = top_k if top_k is not None else self.config.top_k top_p = top_p if top_p is not None else self.config.top_p repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty no_repeat_ngram_size = ( no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size ) bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids num_return_sequences = ( num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences ) decoder_start_token_id = ( decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id ) forced_bos_token_id = ( forced_bos_token_id if forced_bos_token_id is not None else self.config.forced_bos_token_id ) forced_eos_token_id = ( forced_eos_token_id if forced_eos_token_id is not None else self.config.forced_eos_token_id ) if input_ids is not None: batch_size = ge.shape_list(input_ids)[0] # overridden by the input batch_size else: 
batch_size = 1 assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer." assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer." assert isinstance(do_sample, bool), "`do_sample` should be a boolean." assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean." assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer." assert temperature > 0, "`temperature` should be strictly positive." assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." assert input_ids is not None or ( isinstance(bos_token_id, int) and bos_token_id >= 0 ), "If input_ids is not defined, `bos_token_id` should be a positive integer." assert pad_token_id is None or ( isinstance(pad_token_id, int) and (pad_token_id >= 0) ), "`pad_token_id` should be a positive integer." assert (eos_token_id is None) or ( isinstance(eos_token_id, int) and (eos_token_id >= 0) ), "`eos_token_id` should be a positive integer." assert length_penalty > 0, "`length_penalty` should be strictly positive." assert ( isinstance(num_return_sequences, int) and num_return_sequences > 0 ), "`num_return_sequences` should be a strictly positive integer." assert ( bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list) ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated" if input_ids is None: assert isinstance(bos_token_id, int) and bos_token_id >= 0, ( "you should either supply a context to complete as `input_ids` input " "or a `bos_token_id` (integer >= 0) as a first token to start the generation." ) input_ids = tf.fill((batch_size, 1), bos_token_id) else: assert len(ge.shape_list(input_ids)) == 2, "Input prompt should be of shape (batch_size, sequence length)." # not allow to duplicate outputs when greedy decoding if do_sample is False: if num_beams == 1: # no_beam_search greedy generation conditions assert ( num_return_sequences == 1 ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1" else: # beam_search greedy generation conditions assert ( num_beams >= num_return_sequences ), "Greedy beam search decoding cannot return more sequences than it has beams. 
Please set num_beams >= num_return_sequences" # create attention mask if necessary # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140 if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids.numpy()): attention_mask = tf.cast(tf.math.not_equal(input_ids, pad_token_id), dtype=tf.int32) elif attention_mask is None: attention_mask = tf.ones_like(input_ids) if pad_token_id is None and eos_token_id is not None: ge.logger.warning( "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id) ) pad_token_id = eos_token_id # current position and vocab size cur_len = ge.shape_list(input_ids)[1] # unused vocab_size = self.config.vocab_size # set effective batch size and effective batch multiplier according to do_sample if do_sample: effective_batch_size = batch_size * num_return_sequences effective_batch_mult = num_return_sequences else: effective_batch_size = batch_size effective_batch_mult = 1 if self.config.is_encoder_decoder: if decoder_start_token_id is None: decoder_start_token_id = bos_token_id assert ( decoder_start_token_id is not None ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation" assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self) assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder) # get encoder and store encoder outputs encoder = self.get_encoder() encoder_outputs = encoder(input_ids, attention_mask=attention_mask) # Expand input ids if num_beams > 1 or num_return_sequences > 1 if num_return_sequences > 1 or num_beams > 1: input_ids_len = ge.shape_list(input_ids)[-1] input_ids = tf.broadcast_to( tf.expand_dims(input_ids, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) ) attention_mask = tf.broadcast_to( tf.expand_dims(attention_mask, 1), (batch_size, effective_batch_mult * num_beams, input_ids_len) ) input_ids = tf.reshape( input_ids, (effective_batch_size * num_beams, input_ids_len) ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) attention_mask = tf.reshape( attention_mask, (effective_batch_size * num_beams, input_ids_len) ) # shape: (batch_size * num_return_sequences * num_beams, cur_len) if self.config.is_encoder_decoder: # create empty decoder_input_ids input_ids = ( tf.ones( (effective_batch_size * num_beams, 1), dtype=tf.int32, ) * decoder_start_token_id ) cur_len = 1 assert ( batch_size == encoder_outputs[0].shape[0] ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} " # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1) expanded_batch_idxs = tf.reshape( tf.repeat(tf.expand_dims(tf.range(batch_size), -1), repeats=num_beams * effective_batch_mult, axis=1), shape=(-1,), ) # expand encoder_outputs encoder_outputs = (tf.gather(encoder_outputs[0], expanded_batch_idxs, axis=0),) else: encoder_outputs = None cur_len = ge.shape_list(input_ids)[-1] assert ( cur_len < max_length ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. 
Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`" - if num_beams > 1: - output = self._generate_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - early_stopping=early_stopping, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - batch_size=effective_batch_size, - num_return_sequences=num_return_sequences, - length_penalty=length_penalty, - num_beams=num_beams, - vocab_size=vocab_size, - encoder_outputs=encoder_outputs, - attention_mask=attention_mask, - use_cache=use_cache, - forced_bos_token_id=forced_bos_token_id, - forced_eos_token_id=forced_eos_token_id, - ) - else: - output = self._generate_no_beam_search( - input_ids, - cur_len=cur_len, - max_length=max_length, - min_length=min_length, - do_sample=do_sample, - temperature=temperature, - top_k=top_k, - top_p=top_p, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=no_repeat_ngram_size, - bad_words_ids=bad_words_ids, - pad_token_id=pad_token_id, - eos_token_id=eos_token_id, - batch_size=effective_batch_size, - vocab_size=vocab_size, - encoder_outputs=encoder_outputs, - attention_mask=attention_mask, - use_cache=use_cache, - tokenizer=tokenizer, - VERBOSE=VERBOSE, - ) + output = self._generate_no_beam_search( + input_ids, + cur_len=cur_len, + max_length=max_length, + min_length=min_length, + do_sample=do_sample, + temperature=temperature, + top_k=top_k, + top_p=top_p, + repetition_penalty=repetition_penalty, + no_repeat_ngram_size=no_repeat_ngram_size, + bad_words_ids=bad_words_ids, + pad_token_id=pad_token_id, + eos_token_id=eos_token_id, + batch_size=effective_batch_size, + vocab_size=vocab_size, + encoder_outputs=encoder_outputs, + attention_mask=attention_mask, + use_cache=use_cache, + tokenizer=tokenizer, + VERBOSE=VERBOSE, + probaMode=probaMode + ) return output def do_hist(x, title=""): if False: # get number of values below 1000 (assume this is -inf) res = tf.math.count_nonzero(tf.greater_equal(-1000, x)) x = x[tf.math.is_finite(x)] # x = tf.where(x > - 500, x, -500) num_bins = 20 # the histogram of the data n, bins, patches = plt.hist(x, num_bins, facecolor='blue') plt.xlabel('assigned proba') plt.ylabel('occurence') plt.title(r'{}: logistics distribution, # neglible tokens = {}'.format(title, res)) # Tweak spacing to prevent clipping of ylabel plt.subplots_adjust(left=0.15) plt.show() +def getNewProba(probaMode, oldValue, nextValue, nb_gen=None): + if probaMode == "longOk": + newValue = float(np.power( + oldValue, + (nb_gen - 1.0) / nb_gen) * np.power(nextValue, 1.0 / nb_gen) + ) + return newValue + + elif probaMode == "mult": + return oldValue*nextValue + + else: + NotImplementedError("This probability mode is not yet implemented") + print("def generate no beam search modif") def _generate_no_beam_search_modif( self, input_ids, cur_len, max_length, min_length, do_sample, temperature, top_k, top_p, repetition_penalty, no_repeat_ngram_size, bad_words_ids, pad_token_id, eos_token_id, batch_size, vocab_size, encoder_outputs, attention_mask, use_cache, tokenizer, VERBOSE, + probaMode, **kwargs ): """ Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. 
""" answers = [] scores = [] forbiden_first_token = [] context = tokenizer.decode(input_ids[0]) if VERBOSE == True: print("context is : {}".format(context)) for beam_nb in range(3): answer = "" FIRST = True EOS_1st = True nb_gen = 0 output_score = 1 # length of generated sentences / unfinished sentences unfinished_sents = tf.ones_like(input_ids[:, 0]) sent_lengths = tf.ones_like(input_ids[:, 0]) * max_length past = encoder_outputs # defined for encoder-decoder models, None for decoder-only models while cur_len < max_length: nb_gen += 1 model_inputs = self.prepare_inputs_for_generation( input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **kwargs ) outputs = self(**model_inputs) next_token_logits = outputs[0][:, -1, :] do_hist(next_token_logits[0], "init") # if model has past, then set the past variable to speed up decoding if self._use_cache(outputs, use_cache): past = outputs[1] # keep it to test stability, but to get best performance don't use it # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: next_token_logits_penalties = ge._create_next_token_logits_penalties( input_ids, next_token_logits, repetition_penalty ) next_token_logits = tf.math.multiply(next_token_logits, next_token_logits_penalties) logistics_copy = tf.identity(next_token_logits) nb_fails = 0 while True: # restore logistics, if loop is done several times next_token_logits = tf.identity(logistics_copy) if do_sample: # Temperature (higher temperature => more likely to sample low probability tokens) if temperature != 1.0: next_token_logits = next_token_logits / temperature do_hist(next_token_logits[0], "temp") # Top-p/top-k filtering print("topk = {}, top_p = {}".format(top_k, top_p)) next_token_logits = ge.tf_top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) do_hist(next_token_logits[0], "top p/k") # Sample next_token = tf.squeeze( tf.random.categorical(next_token_logits, dtype=tf.int32, num_samples=1), axis=1 ) else: # Greedy decoding next_token = tf.math.argmax(next_token_logits, axis=-1, output_type=tf.int32) # is next token eos or in the sequence? 
if VERBOSE == True: print("The next token is {}".format(tokenizer.decode([int(next_token)]))) tmp_answer = answer + tokenizer.decode([int(next_token)]) forceReuse = True if not forceReuse: print("reuse not enforced") break remove_token = False # if end is detected if int(next_token) == eos_token_id: if EOS_1st: EOS_1st = False probs = tf.nn.softmax(next_token_logits[0]) next_score = probs[int(next_token)] if answer not in answers: answers.append(answer) if answers[-1] not in ["", "-"]: - scores.append(float( - np.power(output_score, (nb_gen - 1.0) / nb_gen) * np.power(next_score, - 1.0 / nb_gen))) + scores.append(getNewProba(probaMode, output_score, next_score, nb_gen=nb_gen)) else: scores.append(0.0) if VERBOSE == True: print("One result would be : {}, with proba {}".format(answers[-1], scores[-1])) remove_token = True else: if answer not in answers: answers.append(answer) if answers[-1] not in ["", "-"]: scores.append(float(output_score)) else: scores.append(0.0) if VERBOSE == True: print("Another result would be : {}, with proba {}".format(answers[-1], scores[-1])) answer = tmp_answer break # if probabillity is low (ignoring current token), or only forbiden tokens are proposed if output_score < 0.05 or nb_fails > 100: if answer not in answers: answers.append(answer) if answers[-1] not in ["", "-"]: scores.append(float(output_score)) else: scores.append(0.0) nb_fails = True if True: # VERBOSE == True: print( "No point to continue searching in ... : {}, with proba {}".format(answers[-1], scores[-1])) break # when predicting the first token, don't start multiple time at the same place if FIRST: FIRST = False if tmp_answer[0] in forbiden_first_token: if "nothing" not in VERBOSE: print("it started with the same letter") remove_token = True else: forbiden_first_token.append(tmp_answer[0]) # check that token is in input, and previous conditions are fullfulled (they would seet remove_token to True) if tmp_answer in context and not remove_token: answer = tmp_answer break # remove that token and search something else else: # remove that token as possibility nb_fails += 1 if "nothing" not in VERBOSE: print("this way the answer would not be in the context") print("remove token {}".format(int(next_token))) tmp = [[False for i in range(logistics_copy.shape[1])]] tmp[0][int(next_token)] = True logistics_copy = ge.set_tensor_by_indices_to_value( logistics_copy, tf.convert_to_tensor(tmp, dtype=tf.bool), -float("inf") ) probs = tf.nn.softmax(next_token_logits[0]) next_score = probs[int(next_token)] - output_score = np.power(output_score, (nb_gen - 1.0) / nb_gen) * np.power(next_score, 1.0 / nb_gen) + output_score = getNewProba(probaMode, output_score, next_score, nb_gen=nb_gen) if VERBOSE == True: print("output_score = {}, next_score = {}".format(output_score, next_score)) # print("The banned tokens are \n {}".format( # [tokenizer.decode([x]) for x in banned_tokens[0]])) # update generations and finished sentences if eos_token_id is not None: # pad finished sentences if eos_token_id exist tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents) else: tokens_to_add = next_token # add token and increase length by one input_ids = tf.concat([input_ids, tf.expand_dims(tokens_to_add, -1)], 1) cur_len = cur_len + 1 if eos_token_id is not None: eos_in_sents = tokens_to_add == eos_token_id # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length is_sents_unfinished_and_token_to_add_is_eos = tf.math.multiply( unfinished_sents, 
tf.cast(eos_in_sents, tf.int32) ) sent_lengths = ( sent_lengths * (1 - is_sents_unfinished_and_token_to_add_is_eos) + cur_len * is_sents_unfinished_and_token_to_add_is_eos ) # unfinished_sents is set to zero if eos in sentence unfinished_sents -= is_sents_unfinished_and_token_to_add_is_eos # stop when there is a in each sentence, or if we exceed the maximum length if tf.math.reduce_max(unfinished_sents) == 0 or nb_fails == True: break # extend attention_mask for new generated input if only decoder if self.config.is_encoder_decoder is False: attention_mask = tf.concat( [attention_mask, tf.ones((ge.shape_list(attention_mask)[0], 1), dtype=tf.int32)], axis=-1 ) # if there are different sentences lengths in the batch, some batches have to be padded min_sent_length = tf.math.reduce_min(sent_lengths) max_sent_length = tf.math.reduce_max(sent_lengths) if min_sent_length != max_sent_length: assert pad_token_id is not None, "`Pad_token_id` has to be defined if batches have different lengths" # finished sents are filled with pad_token padding = tf.ones([batch_size, max_sent_length.numpy()], dtype=tf.int32) * pad_token_id # create length masks for tf.where operation broad_casted_sent_lengths = tf.broadcast_to( tf.expand_dims(sent_lengths, -1), [batch_size, max_sent_length] ) broad_casted_range = tf.transpose( tf.broadcast_to(tf.expand_dims(tf.range(max_sent_length), -1), [max_sent_length, batch_size]) ) decoded = tf.where(broad_casted_range < broad_casted_sent_lengths, input_ids, padding) else: decoded = input_ids if "score" in VERBOSE: print(answers) print(scores) if "" in answers: print(" -- ") if '' in answers: print(" ---- ") # find most likely option if len(scores) > 0: max_ind = scores.index(max(scores)) return [answers[max_ind], scores[max_ind]] else: return ["None", 0] return decoded print("def calc banned ...") def calc_banned_ngram_tokens_modif(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len): # Copied from fairseq for no_repeat_ngram in beam_search if cur_len + 1 < no_repeat_ngram_size: # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet return [[] for _ in range(num_hypos)] generated_ngrams = [{} for _ in range(num_hypos)] for idx in range(num_hypos): gen_tokens = prev_input_ids[idx].numpy().tolist() generated_ngram = generated_ngrams[idx] for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]): prev_ngram_tuple = tuple(ngram[:-1]) generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]] def _get_generated_ngrams(hypo_idx): # Before decoding the next token, prevent decoding of ngrams that have already appeared start_idx = cur_len + 1 - no_repeat_ngram_size ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].numpy().tolist()) print("ngram_idx: {}".format(ngram_idx)) return generated_ngrams[hypo_idx].get(ngram_idx, []) banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)] return banned_tokens \ No newline at end of file
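
Below, a minimal standalone sketch of the SQuAD-style layout that gen_qa_set in DatasetBert._format_data builds before _writeJSON dumps it for run_qa.py; the context, question and answer strings here are toy values, not taken from the real datasets.

import json

# Toy example; the real pairs come from x_train / y_train.
context = "Anna Muster lives in Bern."
question = "Where does Anna Muster live?"
answer = "Bern"

qa_set = {"data": {"question": [], "context": [], "answers": []}}
qa_set["data"]["question"].append(question)
qa_set["data"]["context"].append(context)
# answer_start is the character offset of the answer inside the context,
# located with context.find(answer) as in gen_qa_set.
qa_set["data"]["answers"].append({"answer_start": [context.find(answer)],
                                  "text": [answer]})

with open("train.json", "w") as f:
    json.dump(qa_set, f)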
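
A minimal sketch of the per-threshold bookkeeping done by Model.__check_answer and Model.__updateF1: each generated answer is counted once per confidence threshold as a true/false positive or negative (exact-match comparison), and precision, recall and F1 are derived from the accumulated counts. The three (confidence, correct, given) tuples are invented toy predictions.

import numpy as np

buckets = 10
lim = [i / buckets for i in range(buckets)]            # thresholds 0.0 ... 0.9
score = {k: np.zeros(buckets) for k in ("tp", "fp", "fn", "tn")}

def check_answer(confidence, correct, given):
    for i, threshold in enumerate(lim):
        if confidence > threshold:                     # answer is accepted at this threshold
            score["tp" if correct == given else "fp"][i] += 1
        else:                                          # answer is rejected at this threshold
            score["fn" if correct == given else "tn"][i] += 1

# toy predictions: (confidence, correct answer, given answer)
for conf, corr, given in [(0.95, "Bern", "Bern"), (0.40, "Basel", "Bern"), (0.70, "Genf", "Genf")]:
    check_answer(conf, corr, given)

recall = score["tp"] / (score["tp"] + score["fn"])
precision = score["tp"] / (score["tp"] + score["fp"])
f1 = 2 * precision * recall / (precision + recall)
print(np.round(f1, 2))                                 # one F1 value per confidence threshold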
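
And a small worked example of the two confidence-update modes supported by getNewProba in modif_gpt.py: "longOk" keeps a running geometric mean of the per-token probabilities, so longer answers are not penalised as hard as with a plain product, while "mult" simply multiplies them. The token probabilities 0.9/0.8/0.7 are made up, and this copy raises on an unknown mode.

import numpy as np

def get_new_proba(proba_mode, old_value, next_value, nb_gen=None):
    if proba_mode == "longOk":
        # running geometric mean of the per-token probabilities
        return float(old_value ** ((nb_gen - 1.0) / nb_gen) * next_value ** (1.0 / nb_gen))
    elif proba_mode == "mult":
        # plain product of the per-token probabilities
        return old_value * next_value
    raise NotImplementedError("This probability mode is not yet implemented")

# three generated tokens with probabilities 0.9, 0.8, 0.7
token_probs = [0.9, 0.8, 0.7]
score_long_ok, score_mult = 1.0, 1.0
for n, p in enumerate(token_probs, start=1):
    score_long_ok = get_new_proba("longOk", score_long_ok, p, nb_gen=n)
    score_mult = get_new_proba("mult", score_mult, p)

print(round(score_long_ok, 3))   # ~0.796, the geometric mean of the three probabilities
print(round(score_mult, 3))      # 0.504, the plain product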