F91213593: main.py (11 KB, text/x-python)
Attached to: R11149 PDM-Nicola-Oulu
import os
import json
from tokenizers import ByteLevelBPETokenizer
from transformers import TFGPT2LMHeadModel
from transformers import GPT2Tokenizer
import matplotlib.pyplot as plt
from transformers.generation_tf_utils import TFGenerationMixin
import transformers.generation_tf_utils as ge
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import numpy as np
import tensorflow as tf
import tok
import load_files as lf
import model as mo
import model_xmlr as mox
import modif_gpt as mod_gpt
import torch
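# Note: tok, load_files, model, model_xmlr and modif_gpt are assumed to be
# project-local helper modules (tokenizer training, dataset loading, GPT-2 and
# XLM-R training wrappers, and the patched Hugging Face generation code); they
# are not pip-installable packages.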
SCRATCH = False
MODEL = "gpt2"

home_path = os.getcwd()

# create a folder to save the trained models
if not os.path.exists('models'):
    os.makedirs('models')
os.chdir("./models")
model_path = os.getcwd()
os.chdir(home_path)

os.chdir("./datasets/")
data_path = os.getcwd()
os.chdir(home_path)

# go to the folder with the gpt language-modeling examples
os.chdir("./transformers/examples/")
os.chdir("./language-modeling")
gpt_path = os.getcwd()

# go to the folder with the bert/xlm-r question-answering examples
os.chdir(home_path)
os.chdir("./transformers/examples/")
os.chdir("./question-answering")
bert_path = os.getcwd()
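# Assumed working-directory layout (nothing here is created by this script
# except models/):
#   ./models/                                   -> trained model checkpoints
#   ./datasets/                                 -> raw datasets
#   ./transformers/examples/language-modeling   -> Hugging Face GPT-2 example scripts
#   ./transformers/examples/question-answering  -> Hugging Face QA example scripts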
# load a dataset
if MODEL == "gpt2":
    print("load gpt2 dataset")
    x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, end="")
elif MODEL == "xmlr":
    print("load roberta/xlmr dataset")
    x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, bert_path, end="")
else:
    print("nothing loaded")
    print("!" * 100)
# train a custom tokenizer
if SCRATCH:
    tok_path = tok.train_tok(gpt_path, 1000)

# train the network
name = "gpt_e_1"  # "xmlr_e_1"
if SCRATCH:
    start = None
    tok_loc = "tok"
elif not SCRATCH:
    if MODEL == "gpt2":
        start = "gpt2"
        tok_loc = "gpt2"
    if MODEL == "xmlr":
        start = "xlm-roberta-base"
        tok_loc = "xlm-roberta-base"

if False:  # initial training pass (disabled)
    if MODEL == "gpt2":
        os.chdir(gpt_path)
        mo.train(model_path, 1, name, end="", start=start, tok_loc=tok_loc)
    elif MODEL == "xmlr":
        os.chdir(bert_path)
        mox.train(model_path, 1, name, end="", start=start)

# patch the Hugging Face generation code with the modified versions
TFGenerationMixin._generate_no_beam_search = mod_gpt._generate_no_beam_search_modif
TFGenerationMixin.generate = mod_gpt.generate_modif
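# The patched generate (from modif_gpt) is assumed to return the decoded answer
# text at index 0 and a confidence value for that answer at index 1, and to
# accept the extra tokenizer / VERBOSE keyword arguments; that is how it is
# used in score() below.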
def calc_perf(lim, is_lim, corr_answer, given_answer, corr2):
    for i, l in enumerate(lim):
        # calculate true positives, etc.
        if is_lim > l:
            # confidence over the threshold
            if corr_answer == given_answer:
                corr2[i, 0] += 1
            else:
                corr2[i, 1] += 1
        else:
            if corr_answer == given_answer:
                corr2[i, 2] += 1
            else:
                corr2[i, 3] += 1
    return corr2
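# corr2 has one row per confidence threshold in lim and four columns, matching
# the table printed by print_perf: [true pos, false pos, false neg, true neg].
# Small illustration with made-up values (disabled so it never runs here):
if False:
    demo = np.zeros((2, 4))
    demo = calc_perf([0.3, 0.7], 0.5, "Oulu", "Oulu", demo)
    # confidence 0.5 clears the 0.3 threshold (correct answer -> true positive)
    # but not the 0.7 threshold (correct answer -> false negative):
    # demo == [[1., 0., 0., 0.],
    #          [0., 0., 1., 0.]]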
def print_perf(corr2, lim, console_out, samples):
    print(samples)
    print(corr2[0])
    # true positive etc. table
    tmp = "lim - true pos - false pos - false neg - true neg\n"
    for i, l in enumerate(lim):
        tmp += str(l) + " - "
        for v in corr2[i]:
            tmp += str(int(100 / samples * v)) + "% - "
        # tmp -= "- "
        tmp += "\n"
    print(tmp)
    console_out += tmp + "\n"
    # F1 / recall / precision table
    tmp = "lim - F1 - recall - precision\n"
    for i, l in enumerate(lim):
        tmp += str(l) + " - "
        recall = corr2[i, 0] / (corr2[i, 0] + corr2[i, 2])
        precision = corr2[i, 0] / (corr2[i, 0] + corr2[i, 1])
        F1 = 2 * precision * recall / (precision + recall)
        tmp += str(F1) + " - "
        tmp += str(recall) + " - "
        tmp += str(precision)
        tmp += "\n"
    print(tmp)
    console_out += tmp + "\n"
    return console_out
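# The second table uses the usual definitions, recall = TP / (TP + FN) and
# precision = TP / (TP + FP), with F1 as their harmonic mean; the per-threshold
# counts come from calc_perf. Note that nothing guards against division by zero
# when a row has zero TP + FN (no correct answers) or zero TP + FP (nothing
# above the threshold).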
def score(x, y, name="model_small", step=10, stop=100, console_name=""):
    console_out = ""
    samples = 0
    samples_c = 0
    samples_pc = 0
    x_sure = []
    y_sure = []
    x_unsure = []
    y_unsure = []
    eps = 0.01
    lim = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
    count = [eps, eps, eps, eps, eps, eps, eps, eps, eps]
    corr = [0, 0, 0, 0, 0, 0, 0, 0, 0]
    corr2 = np.zeros((len(lim), 4))
    if MODEL == "gpt2":
        model = mo.get_model(model_path, name)
        if SCRATCH:
            tokenizer = tok.get_tok(tok_path)
        else:
            tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    for ind, start_ in enumerate(x):
        if ind > stop:
            break
        input_ids = tokenizer.encode(start_, return_tensors='tf')
        generated_text_samples = model.generate(
            input_ids,
            max_length=len(input_ids[0]) + 50,
            num_return_sequences=1,
            no_repeat_ngram_size=0,
            repetition_penalty=1.0,
            top_p=1.0,
            temperature=1.0,
            do_sample=False,
            top_k=0,
            early_stopping=True,
            tokenizer=tokenizer,
            VERBOSE=VERBOSE  # module-level flag, set before score() is called
        )
        corr_answer = y[ind][:-len("<|endoftext|>") - 1]
        given_answer = generated_text_samples[0]
        # answer correct with high probability
        if generated_text_samples[1] > 0.8:
            x_sure.append(start_)
            y_sure.append(given_answer + "<|endoftext|>")
        else:
            x_unsure.append(start_)
            y_unsure.append("<|endoftext|>")
        os.chdir(curr_data_path)
        for data_set in ["train", "test"]:
            with open("x_" + data_set + "_sure.json", 'w') as fp:
                json.dump(x_sure, fp)
            with open("y_" + data_set + "_sure.json", 'w') as fp:
                json.dump(y_sure, fp)
            with open("x_" + data_set + "_unsure.json", 'w') as fp:
                json.dump(x_unsure, fp)
            with open("y_" + data_set + "_unsure.json", 'w') as fp:
                json.dump(y_unsure, fp)
        samples += 1
        for i, l in enumerate(lim):
            if generated_text_samples[1] > l:
                if count[i] > eps:
                    count[i] += 1
                else:
                    count[i] = 1
        corr2 = calc_perf(lim, generated_text_samples[1], corr_answer, given_answer, corr2)
        if corr_answer == given_answer:
            samples_c += 1
            samples_pc += 1
            for i, l in enumerate(lim):
                if generated_text_samples[1] > l:
                    corr[i] += 1
        elif given_answer in corr_answer:
            samples_pc += 1
        if ind % step == 0 or ind == stop:
            tmp = "Score: {}, {}, {}".format(samples, samples_c, samples_pc)
            print(tmp)
            console_out += tmp + "\n"
            tmp = "lim : {}\nprob: {}\nused: {}".format(
                lim,
                [int(100 / count[i] * corr[i]) for i, _ in enumerate(lim)],
                [int(100 / samples * count[i]) for i, _ in enumerate(lim)])
            print(tmp)
            console_out += tmp + "\n"
            tmp = "{} / {} ".format(corr_answer, given_answer)
            print(tmp)
            console_out += tmp + "\n"
    console_out = print_perf(corr2, lim, console_out, samples)
    os.chdir(curr_data_path)
    with open("console_out" + console_name + ".txt", 'w') as f:
        f.write(console_out)
    # create a new dataset with good answers
    return samples, samples_c, samples_pc
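# score() also implements the self-training split used by the adaptation runs
# below: prompts whose generated answer has confidence above 0.8 are written to
# the x_*_sure.json / y_*_sure.json files (with the model's own answer as the
# label), the rest to the *_unsure.json files. The later training calls with
# end="_sure" are assumed to reload exactly these files via lf.load_data.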
def score_xmlr(x, y, name="model_small", step=10, stop=100, console_name=""):
    console_out = ""
    samples = 0
    samples_c = 0
    samples_pc = 0
    eps = 0.01
    lim = [x / 10 for x in range(10)]
    # lim = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
    # count = np.zeros((len(lim),))
    # count += eps
    # count = [eps, eps, eps, eps, eps, eps, eps, eps, eps]
    corr2 = np.zeros((len(lim), 4))
    # tmp = [0, 0, 0, 0]
    # corr2 = [tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp]
    if MODEL == "xmlr":
        model, tokenizer = mox.get_model(model_path, name)
    for ind, start_ in enumerate(x):
        if ind > stop:
            break
        tmp = x[ind].split("---")
        text = tmp[1]
        question = tmp[0]
        corr_answer = y[ind]
        inputs = tokenizer(question, text, return_tensors='pt')
        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        outputs = model(**inputs)
        loss = outputs.loss
        start_scores = outputs.start_logits
        end_scores = outputs.end_logits
        a = int(torch.argmax(start_scores))
        b = int(torch.argmax(end_scores))
        probs_a = tf.nn.softmax(start_scores.detach())
        probs_b = tf.nn.softmax(end_scores.detach())
        prob_a = probs_a[0, a]
        prob_b = probs_b[0, b]
        prob_ab = prob_a * prob_b
        print("a = {} with {} %, b = {} with {} %, combo {}".format(a, prob_a, b, prob_b, prob_ab))
        print("correct answer: " + corr_answer)
        given_answer = tokenizer.decode(inputs['input_ids'][0][a:b + 1])
        print("given answer: " + given_answer)
        samples += 1
        corr2 = calc_perf(lim, prob_ab, corr_answer, given_answer, corr2)
        if given_answer[0] == " ":
            given_answer = given_answer[1:]
        if corr_answer == given_answer:
            samples_c += 1
            samples_pc += 1
        elif given_answer in corr_answer:
            samples_pc += 1
        if ind % step == 0 or ind == stop:
            tmp = "Score: {}, {}, {}".format(samples, samples_c, samples_pc)
            print(tmp)
            console_out += tmp + "\n"
            tmp = "{} / {} ".format(corr_answer, given_answer)
            print(tmp)
            console_out += tmp + "\n"
    console_out = print_perf(corr2, lim, console_out, samples)
    os.chdir(curr_data_path)
    with open("console_out" + console_name + ".txt", 'w') as f:
        f.write(console_out)
    return samples, samples_c, samples_pc
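# The confidence used above for the QA model is the product of the softmax
# probabilities of the argmax start and end positions (the PyTorch logits are
# detached and fed to tf.nn.softmax). A minimal, disabled sketch with dummy
# logits, just to illustrate the computation:
if False:
    demo_start = tf.constant([[0.1, 2.0, 0.3]])
    demo_end = tf.constant([[0.2, 0.1, 3.0]])
    demo_a = int(tf.argmax(demo_start[0]))
    demo_b = int(tf.argmax(demo_end[0]))
    demo_prob = tf.nn.softmax(demo_start)[0, demo_a] * tf.nn.softmax(demo_end)[0, demo_b]
    print(float(demo_prob))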
if False:
    score_xmlr(x_test, y_test, name=name, step=10, stop=100)

if True:
    VERBOSE = "nothing_but_score"
    score(x_test, y_test, name=name, step=10, stop=100)

if False:
    # adaptation experiments (disabled): score on several datasets, then
    # fine-tune on the high-confidence answers collected by score()
    x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, dir="ch_full")
    score(x_test, y_test, name=name, step=1, stop=1)
    x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, end="_fr", dir="fr_full_surname")
    score(x_test, y_test, name=name, step=1, stop=1)
    x_train, y_train, x_test, y_test, curr_data_path = lf.load_data(home_path, gpt_path, end="_fi", dir="fi_full_surname")
    score(x_test, y_test, name=name, step=5, stop=100, console_name="_on_ch")

    name = "gpt_e_1_adapt"
    start = "gpt_e_1"
    _, _, _, _, _ = lf.load_data(home_path, gpt_path, end="_sure", dir="fi_full_surname")
    os.chdir(gpt_path)
    mo.train(model_path, 2, name, end="_sure", start=start, tok_loc=tok_loc)
    score(x_test[100:], y_test[100:], name=name, step=5, stop=100, console_name="_on_ch_fi")

    name = "gpt_e_1_adapt2"
    start = "gpt_e_1_adapt"
    _, _, _, _, _ = lf.load_data(home_path, gpt_path, end="_sure", dir="fi_full_surname")
    os.chdir(gpt_path)
    mo.train(model_path, 2, name, end="_sure", start=start, tok_loc=tok_loc)
    score(x_test[200:], y_test[200:], name=name, step=5, stop=100, console_name="_on_ch_fi_fi")