File Metadata

Created: Wed, Jul 2, 16:03

tok.py
View Options

	import os
	import json

	from tokenizers import ByteLevelBPETokenizer

	from transformers import GPT2Tokenizer

	def train_tok(gpt_path, vocab_size):
	    """Train a byte-level BPE tokenizer and write GPT-2 style config files.

	    Trains a ``ByteLevelBPETokenizer`` on ``train.txt`` found in ``gpt_path``,
	    saves the tokenizer model into the ``tok`` sub-directory of ``gpt_path``
	    (created if missing), and writes ``config.json`` and
	    ``tokenizer_config.json`` next to it.

	    Args:
	        gpt_path: Directory that contains ``train.txt``. May be relative or
	            absolute.
	        vocab_size: Target vocabulary size handed to the BPE trainer.

	    Returns:
	        Absolute path of the ``tok`` directory holding the tokenizer and
	        configuration files.

	    Note:
	        The process working directory is left at ``gpt_path`` on return.
	        This side effect is kept on purpose because existing callers may
	        rely on it.
	    """
	    # Switch directories exactly once and remember the absolute location.
	    # Calling os.chdir(gpt_path) a second time from inside "tok" breaks as
	    # soon as gpt_path is a relative path.
	    os.chdir(gpt_path)
	    base_dir = os.getcwd()
	    tok_path = os.path.join(base_dir, "tok")

	    tokenizer = ByteLevelBPETokenizer()

	    print("train tokenizer ...")
	    tokenizer.train(
	        files=os.path.join(base_dir, "train.txt"),
	        vocab_size=vocab_size,
	        min_frequency=2,
	        # "<|endoftext|>" must not contain backslashes: Python keeps the
	        # backslash of an unknown escape such as "\|", which would register
	        # a different token than the GPT-2 end-of-text marker.
	        special_tokens=[
	            "<input>",
	            "<pad>",
	            "<unk>",
	            "<find>",
	            "<answer>",
	            "<|endoftext|>",
	        ],
	    )
	    print("... done training")

	    # save tokenizer
	    os.makedirs(tok_path, exist_ok=True)
	    tokenizer.save_model(tok_path)

	    # create the configuration files
	    config = {
	        "_num_labels": 2,
	        "activation_function": "gelu_new",
	        "architectures": [
	            "GPT2LMHeadModel"
	        ],
	        "attn_pdrop": 0.1,
	        "do_sample": False,
	        "early_stopping": False,
	        "embd_pdrop": 0.1,
	        "id2label": {
	            "0": "LABEL_0",
	            "1": "LABEL_1"
	        },
	        "initializer_range": 0.02,
	        "is_decoder": False,
	        "is_encoder_decoder": False,
	        "label2id": {
	            "LABEL_0": 0,
	            "LABEL_1": 1
	        },
	        "layer_norm_epsilon": 1e-05,
	        "length_penalty": 1.0,
	        "max_length": 20,
	        "min_length": 0,
	        "model_type": "gpt2",
	        "n_ctx": 1024,
	        "n_embd": 768,
	        "n_head": 12,
	        "n_layer": 12,
	        "n_positions": 1024,
	        "no_repeat_ngram_size": 0,
	        "num_beams": 1,
	        "num_return_sequences": 1,
	        "output_attentions": False,
	        "output_hidden_states": False,
	        "output_past": True,
	        "pruned_heads": {},
	        "repetition_penalty": 1.0,
	        "resid_pdrop": 0.1,
	        "summary_first_dropout": 0.1,
	        "summary_proj_to_labels": True,
	        "summary_type": "cls_index",
	        "summary_use_proj": True,
	        "temperature": 1.0,
	        "top_k": 50,
	        "top_p": 1.0,
	        "torchscript": False,
	        "use_bfloat16": False,
	        # Report the size of the vocabulary that was actually trained
	        # instead of a fixed 50257, so the model config matches the
	        # tokenizer saved alongside it.
	        "vocab_size": tokenizer.get_vocab_size(),
	    }
	    print("save config")
	    with open(os.path.join(tok_path, "config.json"), "w", encoding="utf-8") as fp:
	        json.dump(config, fp)

	    tokenizer_config = {
	        "max_len": 1024
	    }
	    with open(
	        os.path.join(tok_path, "tokenizer_config.json"), "w", encoding="utf-8"
	    ) as fp:
	        json.dump(tokenizer_config, fp)

	    return tok_path

	def get_tok(tok_path):
	    """Load a ``GPT2Tokenizer`` from the files stored under ``tok_path``.

	    Args:
	        tok_path: Path passed straight to ``GPT2Tokenizer.from_pretrained``.

	    Returns:
	        The tokenizer object returned by ``from_pretrained``.
	    """
	    return GPT2Tokenizer.from_pretrained(tok_path)

tok.py
No OneTemporary
Actions

File Metadata

tok.py
View Options

Event Timeline

tok.py
No OneTemporary
Actions

File Metadata

tok.py
View Options

Event Timeline

tok.py
No OneTemporary
Actions

tok.py
View Options