# coding=utf-8
# Copyright 2020, The T5 Authors and HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" mT5 model configuration """
from ...configuration_utils import PretrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class MT5Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a :class:`~transformers.MT5Model` or a
:class:`~transformers.TFMT5Model`. It is used to instantiate a mT5 model according to the specified arguments,
defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
to that of the mT5 `google/mt5-small <https://huggingface.co/google/mt5-small>`__ architecture.
Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
Arguments:
vocab_size (:obj:`int`, `optional`, defaults to 32128):
Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
:obj:`inputs_ids` passed when calling :class:`~transformers.T5Model` or :class:`~transformers.TFT5Model`.
d_model (:obj:`int`, `optional`, defaults to 512):
Size of the encoder layers and the pooler layer.
d_kv (:obj:`int`, `optional`, defaults to 64):
Size of the key, query, value projections per attention head. :obj:`d_kv` has to be equal to :obj:`d_model
// num_heads`.
d_ff (:obj:`int`, `optional`, defaults to 1024):
Size of the intermediate feed forward layer in each :obj:`T5Block`.
num_layers (:obj:`int`, `optional`, defaults to 8):
Number of hidden layers in the Transformer encoder.
num_decoder_layers (:obj:`int`, `optional`):
Number of hidden layers in the Transformer decoder. Will use the same value as :obj:`num_layers` if not
set.
num_heads (:obj:`int`, `optional`, defaults to 6):
Number of attention heads for each attention layer in the Transformer encoder.
relative_attention_num_buckets (:obj:`int`, `optional`, defaults to 32):
The number of buckets to use for each attention layer.
dropout_rate (:obj:`float`, `optional`, defaults to 0.1):
The ratio for all dropout layers.
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-6):
The epsilon used by the layer normalization layers.
initializer_factor (:obj:`float`, `optional`, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
feed_forward_proj (:obj:`string`, `optional`, defaults to :obj:`"gated-gelu"`):
Type of feed forward layer to be used. Should be one of :obj:`"relu"` or :obj:`"gated-gelu"`.
use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether or not the model should return the last key/values attentions (not used by all models).
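
    Example (an illustrative sketch; it assumes :class:`~transformers.MT5Model` is available in the installed
    version of the library)::

        >>> from transformers import MT5Config, MT5Model

        >>> # Initializing a configuration with the defaults above (mT5-small sized)
        >>> configuration = MT5Config()

        >>> # Initializing a randomly weighted model from that configuration
        >>> model = MT5Model(configuration)

        >>> # Accessing the model configuration
        >>> configuration = model.config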
"""
    model_type = "mt5"
    keys_to_ignore_at_inference = ["past_key_values"]
    def __init__(
        self,
        vocab_size=250112,
        d_model=512,
        d_kv=64,
        d_ff=1024,
        num_layers=8,
        num_decoder_layers=None,
        num_heads=6,
        relative_attention_num_buckets=32,
        dropout_rate=0.1,
        layer_norm_epsilon=1e-6,
        initializer_factor=1.0,
        feed_forward_proj="gated-gelu",
        is_encoder_decoder=True,
        use_cache=True,
        tokenizer_class="T5Tokenizer",
        tie_word_embeddings=False,
        pad_token_id=0,
        eos_token_id=1,
        decoder_start_token_id=0,
        **kwargs
    ):
        super().__init__(
            is_encoder_decoder=is_encoder_decoder,
            tokenizer_class=tokenizer_class,
            tie_word_embeddings=tie_word_embeddings,
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            decoder_start_token_id=decoder_start_token_id,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.d_kv = d_kv
        self.d_ff = d_ff
        self.num_layers = num_layers
        self.num_decoder_layers = (
            num_decoder_layers if num_decoder_layers is not None else self.num_layers
        )  # default = symmetry
        self.num_heads = num_heads
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.dropout_rate = dropout_rate
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_factor = initializer_factor
        self.feed_forward_proj = feed_forward_proj
        self.use_cache = use_cache
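
    # The three properties below alias the T5-style attribute names of this config (d_model, num_heads,
    # num_layers) to the generic names (hidden_size, num_attention_heads, num_hidden_layers) that other
    # parts of the library look up on any ``PretrainedConfig``.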
    @property
    def hidden_size(self):
        return self.d_model

    @property
    def num_attention_heads(self):
        return self.num_heads

    @property
    def num_hidden_layers(self):
        return self.num_layers
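

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; kept as comments so importing this module
# stays side-effect free). It assumes the ``transformers`` package is installed
# and that the ``google/mt5-small`` checkpoint is reachable on the Hub:
#
#     from transformers import MT5Config
#
#     # Download the architecture hyper-parameters for google/mt5-small.
#     config = MT5Config.from_pretrained("google/mt5-small")
#
#     # The property aliases defined above resolve to the T5-style fields.
#     assert config.hidden_size == config.d_model
#     assert config.num_attention_heads == config.num_heads
#     assert config.num_hidden_layers == config.num_layers
#
#     # Configurations round-trip through JSON on disk.
#     config.save_pretrained("./mt5-small-config")
#     reloaded = MT5Config.from_pretrained("./mt5-small-config")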