Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F120542097
test_tokenization_auto.py
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sat, Jul 5, 02:53
Size
4 KB
Mime Type
text/x-python
Expires
Mon, Jul 7, 02:53 (1 d, 23 h)
Engine
blob
Format
Raw Data
Handle
27195787
Attached To
R11484 ADDI
test_tokenization_auto.py
View Options
# coding=utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
unittest
from
transformers
import
(
BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
,
GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
,
AutoTokenizer
,
BertTokenizer
,
BertTokenizerFast
,
GPT2Tokenizer
,
GPT2TokenizerFast
,
RobertaTokenizer
,
RobertaTokenizerFast
,
)
from
transformers.models.auto.configuration_auto
import
AutoConfig
from
transformers.models.auto.tokenization_auto
import
TOKENIZER_MAPPING
from
transformers.models.roberta.configuration_roberta
import
RobertaConfig
from
transformers.testing_utils
import
(
DUMMY_DIFF_TOKENIZER_IDENTIFIER
,
DUMMY_UNKWOWN_IDENTIFIER
,
SMALL_MODEL_IDENTIFIER
,
require_tokenizers
,
slow
,
)
class AutoTokenizerTest(unittest.TestCase):
    """Tests for the tokenizer classes that ``AutoTokenizer.from_pretrained`` returns."""

    def _check_pretrained_checkpoints(self, model_names, expected_classes):
        """Load each identifier in ``model_names`` and validate the resulting tokenizer.

        Args:
            model_names: iterable of identifiers passed to ``AutoTokenizer.from_pretrained``.
            expected_classes: tuple of classes the loaded tokenizer must be an instance of.

        Each identifier runs inside its own ``subTest`` so that a failure is reported
        together with the identifier and does not stop the remaining ones from running.
        """
        for model_name in model_names:
            with self.subTest(model_name=model_name):
                tokenizer = AutoTokenizer.from_pretrained(model_name)
                self.assertIsNotNone(tokenizer)
                self.assertIsInstance(tokenizer, expected_classes)
                self.assertGreater(len(tokenizer), 0)

    @slow
    def test_tokenizer_from_pretrained(self):
        """Every key of the BERT and GPT-2 archive maps loads as the matching tokenizer family."""
        # Identifiers containing "japanese" are excluded from the BERT check.
        bert_names = [name for name in BERT_PRETRAINED_CONFIG_ARCHIVE_MAP if "japanese" not in name]
        self._check_pretrained_checkpoints(bert_names, (BertTokenizer, BertTokenizerFast))

        # Iterating a mapping yields its keys, i.e. the model identifiers.
        self._check_pretrained_checkpoints(
            GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, (GPT2Tokenizer, GPT2TokenizerFast)
        )

    def test_tokenizer_from_pretrained_identifier(self):
        """``SMALL_MODEL_IDENTIFIER`` must load as a BERT tokenizer with a vocabulary of 12."""
        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 12)

    def test_tokenizer_from_model_type(self):
        """``DUMMY_UNKWOWN_IDENTIFIER`` must load as a RoBERTa tokenizer with a vocabulary of 20."""
        tokenizer = AutoTokenizer.from_pretrained(DUMMY_UNKWOWN_IDENTIFIER)
        self.assertIsInstance(tokenizer, (RobertaTokenizer, RobertaTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 20)

    def test_tokenizer_from_tokenizer_class(self):
        """A checkpoint with a RoBERTa config must still be able to yield a BERT tokenizer."""
        config = AutoConfig.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER)
        self.assertIsInstance(config, RobertaConfig)

        # Check that tokenizer_type ≠ model_type
        tokenizer = AutoTokenizer.from_pretrained(DUMMY_DIFF_TOKENIZER_IDENTIFIER, config=config)
        self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))
        self.assertEqual(tokenizer.vocab_size, 12)

    @require_tokenizers
    def test_tokenizer_identifier_with_correct_config(self):
        """Loading the Dutch cased BERT checkpoint must give ``do_lower_case`` False and max length 512."""
        for tokenizer_class in (BertTokenizer, BertTokenizerFast, AutoTokenizer):
            with self.subTest(tokenizer_class=tokenizer_class.__name__):
                tokenizer = tokenizer_class.from_pretrained("wietsedv/bert-base-dutch-cased")
                self.assertIsInstance(tokenizer, (BertTokenizer, BertTokenizerFast))

                # BertTokenizer instances are checked through their ``basic_tokenizer``;
                # any other instance is checked on the tokenizer object itself.
                if isinstance(tokenizer, BertTokenizer):
                    self.assertEqual(tokenizer.basic_tokenizer.do_lower_case, False)
                else:
                    self.assertEqual(tokenizer.do_lower_case, False)

                self.assertEqual(tokenizer.model_max_length, 512)

    @require_tokenizers
    def test_tokenizer_identifier_non_existent(self):
        """Loading a non-existent identifier must raise ``EnvironmentError``."""
        for tokenizer_class in (BertTokenizer, BertTokenizerFast, AutoTokenizer):
            with self.subTest(tokenizer_class=tokenizer_class.__name__):
                with self.assertRaises(EnvironmentError):
                    tokenizer_class.from_pretrained("julien-c/herlolip-not-exists")

    def test_parents_and_children_in_mappings(self):
        """No config class in a mapping may be a subclass of a config class listed before it."""
        # Test that the children are placed before the parents in the mappings, as the `isinstance` will be
        # triggered by the parents and will return the wrong configuration type when using auto models
        mappings = (TOKENIZER_MAPPING,)

        for mapping in mappings:
            entries = tuple(mapping.items())

            # Compare every entry against all entries that precede it; the first
            # entry has no predecessors, so its inner loop is empty.
            for position, (child_config, _) in enumerate(entries):
                for parent_config, _ in entries[:position]:
                    with self.subTest(
                        msg="Testing if {} is child of {}".format(child_config.__name__, parent_config.__name__)
                    ):
                        self.assertFalse(issubclass(child_config, parent_config))

    @require_tokenizers
    def test_from_pretrained_use_fast_toggle(self):
        """``use_fast=False`` must give ``BertTokenizer``; omitting it must give ``BertTokenizerFast``."""
        slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
        self.assertIsInstance(slow_tokenizer, BertTokenizer)

        default_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
        self.assertIsInstance(default_tokenizer, BertTokenizerFast)
Event Timeline
Log In to Comment