Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F61391531
tokenization_albert_fast.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Mon, May 6, 09:14
Size
11 KB
Mime Type
text/x-python
Expires
Wed, May 8, 09:14 (2 d)
Engine
blob
Format
Raw Data
Handle
17505648
Attached To
R11484 ADDI
tokenization_albert_fast.py
View Options
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast tokenization classes for the ALBERT model."""
import
os
from
shutil
import
copyfile
from
typing
import
List
,
Optional
,
Tuple
from
...file_utils
import
is_sentencepiece_available
from
...tokenization_utils
import
AddedToken
from
...tokenization_utils_fast
import
PreTrainedTokenizerFast
from
...utils
import
logging
# The slow tokenizer is only imported when ``is_sentencepiece_available()`` reports true; otherwise the name
# stays bound to ``None`` so that later references to it do not fail.
AlbertTokenizer = None
if is_sentencepiece_available():
    from .tokenization_albert import AlbertTokenizer

# Module-level logger, named after this module.
logger = logging.get_logger(__name__)
# File names used for each kind of vocabulary artifact, keyed by the constructor argument that receives it.
VOCAB_FILES_NAMES = dict(vocab_file="spiece.model", tokenizer_file="tokenizer.json")
# Per-checkpoint URLs of the ``spiece.model`` and ``tokenizer.json`` files, keyed first by file kind and then by
# checkpoint name. Every URL follows the same ``https://huggingface.co/<checkpoint>/resolve/main/<file>`` layout,
# so the table is generated from the list of checkpoint names instead of being spelled out entry by entry.
PRETRAINED_VOCAB_FILES_MAP = {
    file_kind: {
        checkpoint: "https://huggingface.co/{}/resolve/main/{}".format(checkpoint, file_name)
        for checkpoint in (
            "albert-base-v1",
            "albert-large-v1",
            "albert-xlarge-v1",
            "albert-xxlarge-v1",
            "albert-base-v2",
            "albert-large-v2",
            "albert-xlarge-v2",
            "albert-xxlarge-v2",
        )
    }
    for file_kind, file_name in (("vocab_file", "spiece.model"), ("tokenizer_file", "tokenizer.json"))
}
# Every listed ALBERT checkpoint is mapped to the same size, 512.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = dict.fromkeys(
    (
        "albert-base-v1",
        "albert-large-v1",
        "albert-xlarge-v1",
        "albert-xxlarge-v1",
        "albert-base-v2",
        "albert-large-v2",
        "albert-xlarge-v2",
        "albert-xxlarge-v2",
    ),
    512,
)
# U+2581 (LOWER ONE EIGHTH BLOCK), written as an escape so it cannot be mistaken for an ASCII underscore.
# NOTE(review): presumably the SentencePiece word-boundary marker - confirm against the slow tokenizer.
SPIECE_UNDERLINE = "\u2581"
class AlbertTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__. This tokenizer
    inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`str`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        tokenizer_file (:obj:`str`, `optional`):
            Forwarded unchanged to the superclass constructor. Presumably the path to a serialized `tokenizers` file
            (the pretrained checkpoints name it ``tokenizer.json``).
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to lowercase the input when tokenizing.
        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to keep accents when tokenizing.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning of
                sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end of
                sequence. The token used is the :obj:`sep_token`.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = AlbertTokenizer

    def __init__(
        self,
        vocab_file,
        tokenizer_file=None,
        do_lower_case=True,
        remove_space=True,
        keep_accents=False,
        bos_token="[CLS]",
        eos_token="[SEP]",
        unk_token="<unk>",
        sep_token="[SEP]",
        pad_token="<pad>",
        cls_token="[CLS]",
        mask_token="[MASK]",
        **kwargs
    ):
        # The mask token behaves like a normal word, i.e. it includes the space before it. A plain string is
        # wrapped accordingly; anything else (e.g. an already built AddedToken) is passed through untouched.
        if isinstance(mask_token, str):
            mask_token = AddedToken(mask_token, lstrip=True, rstrip=False)

        super().__init__(
            vocab_file,
            tokenizer_file=tokenizer_file,
            do_lower_case=do_lower_case,
            remove_space=remove_space,
            keep_accents=keep_accents,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            **kwargs,
        )

        self.do_lower_case = do_lower_case
        self.remove_space = remove_space
        self.keep_accents = keep_accents
        # Kept so that ``save_vocabulary`` can copy the original SentencePiece file.
        self.vocab_file = vocab_file

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. An ALBERT sequence has the following format:

        - single sequence: ``[CLS] X [SEP]``
        - pair of sequences: ``[CLS] A [SEP] B [SEP]``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
        """
        sep_ids = [self.sep_token_id]
        cls_ids = [self.cls_token_id]
        if token_ids_1 is None:
            return cls_ids + token_ids_0 + sep_ids
        return cls_ids + token_ids_0 + sep_ids + token_ids_1 + sep_ids

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer ``prepare_for_model`` method.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Set to True if the token list is already formatted with special tokens for the model. In that case
                only the ``[SEP]`` and ``[CLS]`` ids are flagged as special.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            ValueError: If :obj:`already_has_special_tokens` is set and :obj:`token_ids_1` is also supplied.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            # Resolve the two special ids once instead of once per token.
            special_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

        # Layout without pre-existing special tokens: [CLS] A [SEP] (B [SEP]).
        mask = [1] + [0] * len(token_ids_0) + [1]
        if token_ids_1 is not None:
            mask += [0] * len(token_ids_1) + [1]
        return mask

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ::

            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
            | first sequence    | second sequence |

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of ids.
            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
            sequence(s).
        """
        # First segment is [CLS] + A + [SEP]: the two special tokens account for the "+ 2". Only the lengths
        # matter here, so no intermediate id lists are built.
        first_segment = [0] * (len(token_ids_0) + 2)
        if token_ids_1 is None:
            return first_segment
        # Second segment is B + [SEP].
        return first_segment + [1] * (len(token_ids_1) + 1)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Optional[Tuple[str]]:
        """
        Copy the vocabulary file this tokenizer was created from (``self.vocab_file``) into a directory.

        Args:
            save_directory (:obj:`str`):
                Existing directory that receives the vocabulary file.
            filename_prefix (:obj:`str`, `optional`, defaults to :obj:`None`):
                When non-empty, the saved file is named ``<filename_prefix>-spiece.model`` instead of
                ``spiece.model``.

        Returns:
            :obj:`Tuple[str]`: One-element tuple holding the path of the saved file, or :obj:`None` (after logging an
            error) when :obj:`save_directory` is not a directory.
        """
        if not os.path.isdir(save_directory):
            # Lazy %-style arguments: the message is only rendered if the record is actually emitted.
            logger.error("Vocabulary path (%s) should be a directory", save_directory)
            return None

        prefix = filename_prefix + "-" if filename_prefix else ""
        out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])

        # Copying a file onto itself would fail, so skip the copy when source and destination coincide.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
Event Timeline
Log In to Comment