run_xnli.py
#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning multi-lingual models on XNLI (e.g. Bert, DistilBERT, XLM).
Adapted from `examples/text-classification/run_glue.py`"""
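# Example usage (illustrative only; the model name and hyperparameter values below are
# placeholders, but the flag names correspond to the dataclasses defined in this script
# plus the standard `TrainingArguments`):
#
#   python run_xnli.py \
#     --model_name_or_path bert-base-multilingual-cased \
#     --language de \
#     --train_language en \
#     --do_train \
#     --do_eval \
#     --per_device_train_batch_size 32 \
#     --learning_rate 5e-5 \
#     --num_train_epochs 2.0 \
#     --max_seq_length 128 \
#     --output_dir /tmp/debug_xnli/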
import logging
import os
import random
import sys
from dataclasses import dataclass, field
from typing import Optional

import numpy as np
from datasets import load_dataset, load_metric

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint, is_main_process


logger = logging.getLogger(__name__)
@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class into argparse arguments to be able to specify
    them on the command line.
    """

    max_seq_length: Optional[int] = field(
        default=128,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
            "value if set."
        },
    )
    max_val_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of validation examples to this "
            "value if set."
        },
    )
    max_test_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "For debugging purposes or quicker training, truncate the number of test examples to this "
            "value if set."
        },
    )
    server_ip: Optional[str] = field(default=None, metadata={"help": "For distant debugging."})
    server_port: Optional[str] = field(default=None, metadata={"help": "For distant debugging."})
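# Note (illustrative): `HfArgumentParser`, used in `main()` below, exposes each field of the
# dataclass above as a command-line option of the same name, e.g. `--max_seq_length 256` or
# `--max_train_samples 1000`; the values shown here are only examples.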
@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    language: str = field(
        default=None, metadata={"help": "Evaluation language. Also train language if `train_language` is set to None."}
    )
    train_language: Optional[str] = field(
        default=None, metadata={"help": "Train language if it is different from the evaluation language."}
    )
    config_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )
    do_lower_case: Optional[bool] = field(
        default=False,
        metadata={"help": "arg to indicate if tokenizer should do lower case in AutoTokenizer.from_pretrained()"},
    )
    use_fast_tokenizer: bool = field(
        default=True,
        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
    )
    model_revision: str = field(
        default="main",
        metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
    )
    use_auth_token: bool = field(
        default=False,
        metadata={
            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup distant debugging if needed
    if data_args.server_ip and data_args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(data_args.server_ip, data_args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()
    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    # Downloading and loading xnli dataset from the hub.
    if model_args.train_language is None:
        train_dataset = load_dataset("xnli", model_args.language, split="train")
    else:
        train_dataset = load_dataset("xnli", model_args.train_language, split="train")

    eval_dataset = load_dataset("xnli", model_args.language, split="validation")

    # Labels
    label_list = train_dataset.features["label"].names
    num_labels = len(label_list)
    # Load pretrained model and tokenizer
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task="xnli",
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        do_lower_case=model_args.do_lower_case,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    # Preprocessing the datasets
    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    def preprocess_function(examples):
        # Tokenize the texts
        return tokenizer(
            examples["premise"],
            examples["hypothesis"],
            padding=padding,
            max_length=data_args.max_seq_length,
            truncation=True,
        )

    if training_args.do_train:
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        train_dataset = train_dataset.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
        )
    if training_args.do_eval:
        if data_args.max_val_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
        eval_dataset = eval_dataset.map(
            preprocess_function,
            batched=True,
            load_from_cache_file=not data_args.overwrite_cache,
        )

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # Get the metric function
    metric = load_metric("xnli")
    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=p.label_ids)

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None
    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )
    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            model_path = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            model_path = model_args.model_name_or_path
        else:
            model_path = None
        train_result = trainer.train(model_path=model_path)
        metrics = train_result.metrics
        max_train_samples = (
            data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
        )
        metrics["train_samples"] = min(max_train_samples, len(train_dataset))

        trainer.save_model()  # Saves the tokenizer too for easy upload

        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
    # Evaluation
    if training_args.do_eval:
        logger.info("*** Evaluate ***")
        metrics = trainer.evaluate(eval_dataset=eval_dataset)

        max_val_samples = data_args.max_val_samples if data_args.max_val_samples is not None else len(eval_dataset)
        metrics["eval_samples"] = min(max_val_samples, len(eval_dataset))

        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)


if __name__ == "__main__":
    main()