# coding=utf-8
# Copyright 2018 T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Tokenization class for model T5."""

import os
import re
import warnings
from shutil import copyfile
from typing import List, Optional, Tuple

import sentencepiece as spm

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "t5-small": "https://huggingface.co/t5-small/resolve/main/spiece.model",
        "t5-base": "https://huggingface.co/t5-base/resolve/main/spiece.model",
        "t5-large": "https://huggingface.co/t5-large/resolve/main/spiece.model",
        "t5-3b": "https://huggingface.co/t5-3b/resolve/main/spiece.model",
        "t5-11b": "https://huggingface.co/t5-11b/resolve/main/spiece.model",
    }
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "t5-small": 512,
    "t5-base": 512,
    "t5-large": 512,
    "t5-3b": 512,
    "t5-11b": 512,
}


class T5Tokenizer(PreTrainedTokenizer):
    """
    Construct a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`str`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        extra_ids (:obj:`int`, `optional`, defaults to 100):
            Number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are accessible
            as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are indexed from
            the end of the vocabulary up to the beginning ("<extra_id_0>" is the last token in the vocabulary), like
            in T5 preprocessing, see `here
            <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__.
        additional_special_tokens (:obj:`List[str]`, `optional`):
            Additional special tokens used by the tokenizer.

    Attributes:
        sp_model (:obj:`SentencePieceProcessor`):
            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        extra_ids=100,
        additional_special_tokens=None,
        **kwargs
    ):
        # Add extra_ids to the special token list
        if extra_ids > 0 and additional_special_tokens is None:
            additional_special_tokens = ["<extra_id_{}>".format(i) for i in range(extra_ids)]
        elif extra_ids > 0 and additional_special_tokens is not None:
            # Check that we have the right number of extra_id special tokens
            extra_tokens = len(set(filter(lambda x: bool("extra_id" in x), additional_special_tokens)))
            if extra_tokens != extra_ids:
                raise ValueError(
                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are provided to T5Tokenizer. "
                    "In this case the additional_special_tokens must include the extra_ids tokens"
                )

        super().__init__(
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            extra_ids=extra_ids,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )

        self.vocab_file = vocab_file
        self._extra_ids = extra_ids

        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(vocab_file)

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size() + self._extra_ids

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

        # normal case: some special tokens
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + [1]
        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
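
    # Sketch of the resulting mask with hypothetical ids:
    #
    #     tokenizer.get_special_tokens_mask([10, 11, 12])            # [0, 0, 0, 1]
    #     tokenizer.get_special_tokens_mask([10, 11], [20, 21, 22])  # [0, 0, 1, 0, 0, 0, 1]
    #
    # The trailing 1s mark the </s> tokens appended by ``build_inputs_with_special_tokens``.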

    def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
        """Do not add eos again if user already added it."""
        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated eos tokens being added."
            )
            return token_ids
        else:
            return token_ids + [self.eos_token_id]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros.
        """
        eos = [self.eos_token_id]

        if token_ids_1 is None:
            return len(token_ids_0 + eos) * [0]
        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
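
    # Sketch with hypothetical ids: a pair of lengths 2 and 3 yields 2 + 1 + 3 + 1 = 7 zeros,
    # one per token including the two appended </s> tokens:
    #
    #     tokenizer.create_token_type_ids_from_sequences([10, 11], [20, 21, 22])  # [0] * 7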

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:

        - single sequence: ``X </s>``
        - pair of sequences: ``A </s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        else:
            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
            return token_ids_0 + token_ids_1
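
    # Sketch with hypothetical ids, assuming eos_token_id == 1 (the usual T5 convention):
    #
    #     tokenizer.build_inputs_with_special_tokens([10, 11])            # [10, 11, 1]
    #     tokenizer.build_inputs_with_special_tokens([10, 11], [20, 21])  # [10, 11, 1, 20, 21, 1]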

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(self.vocab_file)

    def _tokenize(self, text, sample=False):
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        if not sample:
            pieces = self.sp_model.EncodeAsPieces(text)
        else:
            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
        return pieces
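
    # Note on the sampling branch above: the positional arguments 64 and 0.1 passed to
    # SentencePiece's ``SampleEncodeAsPieces`` are the n-best size and smoothing alpha, so
    # ``sample=True`` yields subword-regularized (non-deterministic) segmentations, while the
    # default ``sample=False`` is deterministic.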

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        if token.startswith("<extra_id_"):
            match = re.match(r"<extra_id_(\d+)>", token)
            num = int(match.group(1))
            return self.vocab_size - num - 1
        return self.sp_model.piece_to_id(token)
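
    # Worked example of the sentinel arithmetic above, assuming the standard T5 vocabulary of
    # 32000 SentencePiece pieces plus the default 100 extra ids (vocab_size == 32100):
    #
    #     "<extra_id_0>"  -> 32100 - 0 - 1  == 32099
    #     "<extra_id_99>" -> 32100 - 99 - 1 == 32000
    #
    # so the sentinels fill the top of the id space in reverse order.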

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        if index < self.sp_model.get_piece_size():
            token = self.sp_model.IdToPiece(index)
        else:
            token = "<extra_id_{}>".format(self.vocab_size - 1 - index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) to a single string."""
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                out_string += self.sp_model.decode_pieces(current_sub_tokens) + token + " "
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += self.sp_model.decode_pieces(current_sub_tokens)
        return out_string.strip()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
            logger.info(f"Copy vocab file to {out_vocab_file}")

        return (out_vocab_file,)
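

# Illustrative end-to-end sketch (assumes the surrounding ``transformers`` package and access
# to the Hugging Face hub; nothing below runs at import time):
#
#     from transformers import T5Tokenizer
#
#     tokenizer = T5Tokenizer.from_pretrained("t5-small")
#     ids = tokenizer.encode("translate English to German: Hello")
#     tokenizer.convert_ids_to_tokens(ids)[-1]  # "</s>", appended by build_inputs_with_special_tokens
#     tokenizer.save_vocabulary(".")            # copies spiece.model into the current directory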