Page MenuHomec4science

main.py
No OneTemporary

File Metadata

Created
Sat, Nov 9, 01:07
import os
import json
from tokenizers import ByteLevelBPETokenizer
from transformers import TFGPT2LMHeadModel
from transformers import GPT2Tokenizer
import matplotlib.pyplot as plt
from transformers.generation_tf_utils import TFGenerationMixin
import transformers.generation_tf_utils as ge
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import numpy as np
import tensorflow as tf
import tok as tok
import load_files as lf
import model as mo
import model_xmlr as mox
import modif_gpt as mod_gpt
import torch
# --- configuration ------------------------------------------------------
SCRATCH = False   # True: use a tokenizer/model trained from scratch
MODEL = "gpt2"    # which pipeline to run: "gpt2" or "xmlr"

home_path = os.getcwd()

# create a folder to save the trained models
if not os.path.exists('models'):
    os.makedirs('models')

# resolve the relevant directories to absolute paths; the chdir
# round-trips also assert that each directory actually exists
os.chdir("./models")
model_path = os.getcwd()
os.chdir(home_path)
os.chdir("./datasets/")
data_path = os.getcwd()
os.chdir(home_path)
# go to the folder with the gpt models
os.chdir("./transformers/examples/")
os.chdir("./language-modeling")
gpt_path = os.getcwd()
# go to the folder with the question-answering (BERT/XLM-R) examples
os.chdir(home_path)
os.chdir("./transformers/examples/")
os.chdir("./question-answering")
bert_path = os.getcwd()

# load a dataset for the selected model family
if MODEL == "gpt2":
    print("load gpt2 dataset")
    x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, end="")
elif MODEL == "xmlr":
    print("load roberta/xlmr dataset")
    x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, bert_path, end="")
else:
    print("nothing loaded")
    print("!"*100)
# train a custom tokenizer (only when starting from scratch)
if SCRATCH:
    tok_path = tok.train_tok(gpt_path, 1000)

# select the checkpoint / tokenizer location to start from
name = "gpt_e_1"  # "xmlr_e_1"
if SCRATCH:
    start = None
    tok_loc = "tok"
else:
    if MODEL == "gpt2":
        start = "gpt2"
        tok_loc = "gpt2"
    if MODEL == "xmlr":
        start = "xlm-roberta-base"
        tok_loc = "xlm-roberta-base"

# one-off training run (disabled)
if False:
    if MODEL == "gpt2":
        os.chdir(gpt_path)
        mo.train(model_path, 1, name, end="", start=start, tok_loc=tok_loc)
    elif MODEL == "xmlr":
        os.chdir(bert_path)
        mox.train(model_path, 1, name, end="", start=start)

# monkey-patch the huggingface generation code with the modified versions
# from modif_gpt (score() below reads a confidence value from the result,
# so the patched generate() presumably returns one — see modif_gpt)
TFGenerationMixin._generate_no_beam_search = mod_gpt._generate_no_beam_search_modif
TFGenerationMixin.generate = mod_gpt.generate_modif
def calc_perf(lim, is_lim, corr_answer, given_answer, corr2):
    """Update the per-threshold confusion table with one prediction.

    For every confidence threshold in `lim`, classify this prediction into
    one of the four cells of row i of `corr2`:
        col 0: confidence above threshold, answer correct  (true positive)
        col 1: confidence above threshold, answer wrong    (false positive)
        col 2: confidence at/below threshold, correct      (false negative)
        col 3: confidence at/below threshold, wrong        (true negative)

    Args:
        lim: list of confidence thresholds.
        is_lim: confidence score of this prediction.
        corr_answer: ground-truth answer string.
        given_answer: model answer string.
        corr2: numpy array of shape (len(lim), 4); mutated in place.

    Returns:
        The updated `corr2` (the same object that was passed in).
    """
    for i, threshold in enumerate(lim):
        if is_lim > threshold:
            # confidence over the threshold
            if corr_answer == given_answer:
                corr2[i, 0] += 1
            else:
                corr2[i, 1] += 1
        else:
            if corr_answer == given_answer:
                corr2[i, 2] += 1
            else:
                corr2[i, 3] += 1
    return corr2
def print_perf(corr2, lim, console_out, samples):
    """Print the confusion table and F1/recall/precision per threshold.

    Args:
        corr2: numpy array of shape (len(lim), 4) with columns
            [true pos, false pos, false neg, true neg] (see calc_perf).
        lim: list of confidence thresholds (row labels).
        console_out: log string accumulated so far.
        samples: total number of scored samples (for the percentages).

    Returns:
        `console_out` with both formatted tables appended.
    """
    print(samples)
    print(corr2[0])
    # confusion counts, as percentages of all samples
    tmp = "lim - true pos - false pos - false neg - true neg\n"
    for i, l in enumerate(lim):
        tmp += str(l) + " - "
        for v in corr2[i]:
            tmp += str(int(100 / samples * v)) + "% - "
        tmp += "\n"
    print(tmp)
    console_out += tmp + "\n"
    # derived metrics; guard the divisions against empty rows, which in
    # the original produced nan values and numpy runtime warnings
    tmp = "lim - F1 - recall - precision \n"
    for i, l in enumerate(lim):
        tmp += str(l) + " - "
        tp, fp, fn = corr2[i, 0], corr2[i, 1], corr2[i, 2]
        recall = tp / (tp + fn) if tp + fn > 0 else 0.0
        precision = tp / (tp + fp) if tp + fp > 0 else 0.0
        F1 = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
        tmp += str(F1) + " - "
        tmp += str(recall) + " - "
        tmp += str(precision)
        tmp += "\n"
    print(tmp)
    console_out += tmp + "\n"
    return console_out
def score(x, y, name="model_small", step=10, stop=100, console_name=""):
    """Evaluate a GPT-2 model on prompt/answer pairs.

    Generates an answer for every prompt in `x`, compares it with the
    reference in `y`, accumulates accuracy statistics per confidence
    threshold, splits the data into sure/unsure JSON files and writes a
    console log into `curr_data_path`.

    Args:
        x: list of prompt strings.
        y: list of reference answers, each ending with "<|endoftext|>".
        name: model directory name under `model_path`.
        step: print/log statistics every `step` samples.
        stop: evaluate at most `stop` + 1 samples.
        console_name: suffix for the console log file name.

    Returns:
        Tuple (samples, samples_c, samples_pc): evaluated samples,
        exactly correct answers, partially correct answers.

    NOTE(review): depends on module globals MODEL, SCRATCH, model_path,
    curr_data_path, tok_path and VERBOSE being set before the call.
    """
    console_out = ""
    samples = 0
    samples_c = 0    # exactly correct answers
    samples_pc = 0   # partially correct (given answer contained in reference)
    x_sure, y_sure = [], []      # high-confidence predictions
    x_unsure, y_unsure = [], []  # low-confidence predictions
    eps = 0.01
    lim = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
    # eps keeps the percentage division below well-defined before any hit
    count = [eps] * len(lim)
    corr = [0] * len(lim)
    corr2 = np.zeros((len(lim), 4))
    if MODEL == "gpt2":
        model = mo.get_model(model_path, name)
        if SCRATCH:
            tokenizer = tok.get_tok(tok_path)
        else:
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    for ind, start_ in enumerate(x):
        if ind > stop:
            break
        input_ids = tokenizer.encode(start_, return_tensors='tf')
        # greedy decoding; the patched generate() (see modif_gpt) returns
        # the text at index 0 and a confidence value at index 1
        generated_text_samples = model.generate(
            input_ids,
            max_length=len(input_ids[0]) + 50,
            num_return_sequences=1,
            no_repeat_ngram_size=0,
            repetition_penalty=1.0,
            top_p=1.0,
            temperature=1.0,
            do_sample=False,
            top_k=0,
            early_stopping=True,
            tokenizer=tokenizer,
            VERBOSE=VERBOSE
        )
        corr_answer = y[ind][:-len("<|endoftext|>") - 1]
        given_answer = generated_text_samples[0]
        # split the sample into the sure/unsure set by confidence
        if generated_text_samples[1] > 0.8:
            x_sure.append(start_)
            y_sure.append(given_answer + "<|endoftext|>")
        else:
            x_unsure.append(start_)
            # NOTE(review): the unsure target drops the generated text —
            # confirm that an empty target is intended here
            y_unsure.append("<|endoftext|>")
        samples += 1
        for i, l in enumerate(lim):
            if generated_text_samples[1] > l:
                # first hit replaces eps with 1, afterwards increment
                if count[i] > eps:
                    count[i] += 1
                else:
                    count[i] = 1
        corr2 = calc_perf(lim, generated_text_samples[1], corr_answer, given_answer, corr2)
        if corr_answer == given_answer:
            samples_c += 1
            samples_pc += 1
            for i, l in enumerate(lim):
                if generated_text_samples[1] > l:
                    corr[i] += 1
        elif given_answer in corr_answer:
            samples_pc += 1
        if ind % step == 0 or ind == stop:
            tmp = "Score: {}, {}, {}".format(samples, samples_c, samples_pc)
            print(tmp)
            console_out += tmp + "\n"
            tmp = "lim : {} \n prob: {} \n used: {}".format(
                lim,
                [int(100 / count[i] * corr[i]) for i, _ in enumerate(lim)],
                [int(100 / samples * count[i]) for i, _ in enumerate(lim)])
            print(tmp)
            console_out += tmp + "\n"
            tmp = "{} / {} ".format(corr_answer, given_answer)
            print(tmp)
            console_out += tmp + "\n"
            console_out = print_perf(corr2, lim, console_out, samples)
    # write the sure/unsure splits once, after the loop (the original
    # rewrote the same files on every iteration; final content identical)
    os.chdir(curr_data_path)
    for data_set in ["train", "test"]:
        # NOTE(review): train and test files receive identical content —
        # confirm this duplication is intended
        with open("x_" + data_set + "_sure.json", 'w') as fp:
            json.dump(x_sure, fp)
        with open("y_" + data_set + "_sure.json", 'w') as fp:
            json.dump(y_sure, fp)
        with open("x_" + data_set + "_unsure.json", 'w') as fp:
            json.dump(x_unsure, fp)
        with open("y_" + data_set + "_unsure.json", 'w') as fp:
            json.dump(y_unsure, fp)
    with open("console_out" + console_name + ".txt", 'w') as f:
        f.write(console_out)
    return samples, samples_c, samples_pc
def score_xmlr(x, y, name="model_small", step=10, stop=100, console_name=""):
    """Evaluate a fine-tuned XLM-R question-answering model.

    Each entry of `x` has the form "question---context". The model predicts
    an answer span; the product of the start and end probabilities is used
    as the confidence for the per-threshold statistics.

    Args:
        x: list of "question---context" strings.
        y: list of reference answer strings.
        name: model directory name under `model_path`.
        step: print/log statistics every `step` samples.
        stop: evaluate at most `stop` + 1 samples.
        console_name: suffix for the console log file name.

    Returns:
        Tuple (samples, samples_c, samples_pc): evaluated samples,
        exactly correct answers, partially correct answers.

    NOTE(review): depends on module globals MODEL, model_path and
    curr_data_path being set before the call.
    """
    console_out = ""
    samples = 0
    samples_c = 0    # exactly correct answers
    samples_pc = 0   # partially correct (given answer contained in reference)
    lim = [i / 10 for i in range(10)]
    corr2 = np.zeros((len(lim), 4))
    if MODEL == "xmlr":
        model, tokenizer = mox.get_model(model_path, name)
    for ind, start_ in enumerate(x):
        if ind > stop:
            break
        parts = x[ind].split("---")
        question = parts[0]
        text = parts[1]
        corr_answer = y[ind]
        inputs = tokenizer(question, text, return_tensors='pt')
        outputs = model(**inputs)
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits
        # most likely span boundaries
        a = int(torch.argmax(start_scores))
        b = int(torch.argmax(end_scores))
        probs_a = tf.nn.softmax(start_scores.detach())
        probs_b = tf.nn.softmax(end_scores.detach())
        prob_a = probs_a[0, a]
        prob_b = probs_b[0, b]
        prob_ab = prob_a * prob_b  # joint confidence of the span
        print("a = {} with {} %, b = {} with {} %, combo {}".format(a, prob_a, b, prob_b, prob_ab))
        print("correct answer: " + corr_answer)
        given_answer = tokenizer.decode(inputs['input_ids'][0][a:b + 1])
        print("given answer: " + given_answer)
        # strip the tokenizer's leading space BEFORE scoring so that
        # calc_perf and the exact-match counters compare the same string
        # (the original stripped only after calc_perf, and indexing
        # given_answer[0] raised IndexError on an empty span when b < a)
        if given_answer.startswith(" "):
            given_answer = given_answer[1:]
        samples += 1
        corr2 = calc_perf(lim, prob_ab, corr_answer, given_answer, corr2)
        if corr_answer == given_answer:
            samples_c += 1
            samples_pc += 1
        elif given_answer in corr_answer:
            samples_pc += 1
        if ind % step == 0 or ind == stop:
            tmp = "Score: {}, {}, {}".format(samples, samples_c, samples_pc)
            print(tmp)
            console_out += tmp + "\n"
            tmp = "{} / {} ".format(corr_answer, given_answer)
            print(tmp)
            console_out += tmp + "\n"
            console_out = print_perf(corr2, lim, console_out, samples)
    os.chdir(curr_data_path)
    with open("console_out" + console_name + ".txt", 'w') as f:
        f.write(console_out)
    return samples, samples_c, samples_pc
# --- experiment switches ------------------------------------------------
if False:
    # XLM-R evaluation
    score_xmlr(x_test, y_test, name=name, step=10, stop=100)
if True:
    # GPT-2 evaluation
    VERBOSE = "nothing_but_score"
    score(x_test, y_test, name=name, step=10, stop=100)
if False:
    # cross-lingual adaptation experiments (disabled)
    x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, dir="ch_full")
    # fixed: score() has no `model` parameter — the keyword must be `name`
    score(x_test, y_test, name=name, step=1, stop=1)
    x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, end="_fr", dir="fr_full_surname")
    score(x_test, y_test, name=name, step=1, stop=1)
    x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, end="_fi", dir="fi_full_surname")
    score(x_test, y_test, name=name, step=5, stop=100, console_name="_on_ch")
    # first self-training round on the "sure" answers produced by score()
    name = "gpt_e_1_adapt"
    start = "gpt_e_1"
    _, _, _, _, _ = lf.load_data(home_path, gpt_path, end="_sure", dir="fi_full_surname")
    os.chdir(gpt_path)
    mo.train(model_path, 2, name, end="_sure", start=start, tok_loc=tok_loc)
    score(x_test[100:], y_test[100:], name=name, step=5, stop=100, console_name="_on_ch_fi")
    # second self-training round
    name = "gpt_e_1_adapt2"
    start = "gpt_e_1_adapt"
    _, _, _, _, _ = lf.load_data(home_path, gpt_path, end="_sure", dir="fi_full_surname")
    os.chdir(gpt_path)
    mo.train(model_path, 2, name, end="_sure", start=start, tok_loc=tok_loc)
    score(x_test[200:], y_test[200:], name=name, step=5, stop=100, console_name="_on_ch_fi_fi")

Event Timeline