F91212782: tok.py
File Metadata
Created: Sat, Nov 9, 00:56
Size: 2 KB
Mime Type: text/x-python
Expires: Mon, Nov 11, 00:56 (1 d, 23 h)
Attached To: R11149 PDM-Nicola-Oulu
tok.py
import os
import json

from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Tokenizer


def train_tok(gpt_path, vocab_size):
    """Train a byte-level BPE tokenizer on train.txt in gpt_path and save it,
    together with a matching GPT-2 model config, under gpt_path/tok."""
    tokenizer = ByteLevelBPETokenizer()
    os.chdir(gpt_path)
    print("train tokenizer ...")
    tokenizer.train(
        files="train.txt",
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=["<input>", "<pad>", "<unk>", "<find>", "<answer>", "<|endoftext|>"],
    )
    print("... done training")

    # save tokenizer
    if not os.path.exists('tok'):
        os.makedirs('tok')
    tokenizer.save_model("tok")

    # get the path of the tokenizer files
    os.chdir('tok')
    tok_path = os.getcwd()
    # print(tok_path)
    os.chdir(gpt_path)

    # create the configuration files
    config = {
        "_num_labels": 2,
        "activation_function": "gelu_new",
        "architectures": ["GPT2LMHeadModel"],
        "attn_pdrop": 0.1,
        "do_sample": False,
        "early_stopping": False,
        "embd_pdrop": 0.1,
        "id2label": {"0": "LABEL_0", "1": "LABEL_1"},
        "initializer_range": 0.02,
        "is_decoder": False,
        "is_encoder_decoder": False,
        "label2id": {"LABEL_0": 0, "LABEL_1": 1},
        "layer_norm_epsilon": 1e-05,
        "length_penalty": 1.0,
        "max_length": 20,
        "min_length": 0,
        "model_type": "gpt2",
        "n_ctx": 1024,
        "n_embd": 768,
        "n_head": 12,
        "n_layer": 12,
        "n_positions": 1024,
        "no_repeat_ngram_size": 0,
        "num_beams": 1,
        "num_return_sequences": 1,
        "output_attentions": False,
        "output_hidden_states": False,
        "output_past": True,
        "pruned_heads": {},
        "repetition_penalty": 1.0,
        "resid_pdrop": 0.1,
        "summary_first_dropout": 0.1,
        "summary_proj_to_labels": True,
        "summary_type": "cls_index",
        "summary_use_proj": True,
        "temperature": 1.0,
        "top_k": 50,
        "top_p": 1.0,
        "torchscript": False,
        "use_bfloat16": False,
        # bug fix: was hard-coded to GPT-2's default 50257; use the trained
        # vocabulary size so the model's embedding table matches the tokenizer
        "vocab_size": vocab_size,
    }
    print("save config")
    with open(tok_path + "/config.json", 'w') as fp:
        json.dump(config, fp)

    tokenizer_config = {"max_len": 1024}
    with open(tok_path + "/tokenizer_config.json", 'w') as fp:
        json.dump(tokenizer_config, fp)

    return tok_path


def get_tok(tok_path):
    """Load the saved tokenizer files as a transformers GPT2Tokenizer."""
    tokenizer = GPT2Tokenizer.from_pretrained(tok_path)
    return tokenizer
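
A minimal usage sketch, not part of the original file: it assumes a plain-text corpus named train.txt already exists in the directory passed as gpt_path, and the path and vocabulary size below are hypothetical placeholders.

import os

gpt_path = os.path.expanduser("~/gpt2-data")  # hypothetical; must contain train.txt
tok_path = train_tok(gpt_path, vocab_size=5000)

# reload the trained tokenizer through transformers and round-trip a string
tokenizer = get_tok(tok_path)
ids = tokenizer.encode("hello world")
print(ids)
print(tokenizer.decode(ids))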