test_tokenization_bert.py

# coding=utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import unittest

from transformers import BertTokenizerFast
from transformers.models.bert.tokenization_bert import (
    VOCAB_FILES_NAMES,
    BasicTokenizer,
    BertTokenizer,
    WordpieceTokenizer,
    _is_control,
    _is_punctuation,
    _is_whitespace,
)
from transformers.testing_utils import require_tokenizers, slow

from .test_tokenization_common import TokenizerTesterMixin, filter_non_english


@require_tokenizers
class BertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):

    tokenizer_class = BertTokenizer
    rust_tokenizer_class = BertTokenizerFast
    test_rust_tokenizer = True
    space_between_special_tokens = True
    from_pretrained_filter = filter_non_english
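
    # Added note (not in the original file): these class attributes configure the
    # shared TokenizerTesterMixin checks, selecting the Python and Rust tokenizer
    # classes under test, enabling the Rust tokenizer comparisons, and passing
    # filter_non_english as the filter applied to the pretrained checkpoints the
    # mixin exercises (presumably to keep only English ones).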

    def setUp(self):
        super().setUp()

        vocab_tokens = [
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[PAD]",
            "[MASK]",
            "want",
            "##want",
            "##ed",
            "wa",
            "un",
            "runn",
            "##ing",
            ",",
            "low",
            "lowest",
        ]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
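
    # Added note (not in the original file): token ids in this toy vocab are simply
    # the zero-based line indices of the tokens written above, e.g. "un" -> 9,
    # "##want" -> 6, "##ed" -> 7, "," -> 12, "runn" -> 10, "##ing" -> 11, which is
    # exactly what the id assertion in test_full_tokenizer below relies on.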

    def get_input_output_texts(self, tokenizer):
        input_text = "UNwant\u00E9d,running"
        output_text = "unwanted, running"
        return input_text, output_text

    def test_full_tokenizer(self):
        tokenizer = self.tokenizer_class(self.vocab_file)

        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])
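
    # Added sketch (not part of the original file): a minimal check, using only the
    # toy vocab from setUp, that a word which cannot be fully covered by WordPiece
    # pieces collapses to "[UNK]" as a whole. The method name and expected output
    # are illustrative, derived from the behaviour asserted in
    # test_wordpiece_tokenizer further below.
    def test_full_tokenizer_unknown_word_sketch(self):
        tokenizer = self.tokenizer_class(self.vocab_file)
        # "unwantedx" matches "un", "##want", "##ed" but the trailing "x" has no
        # "##x" piece in the toy vocab, so the entire word becomes "[UNK]".
        self.assertListEqual(tokenizer.tokenize("UNwantedX running"), ["[UNK]", "runn", "##ing"])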

    def test_rust_and_python_full_tokenizers(self):
        if not self.test_rust_tokenizer:
            return

        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        sequence = "UNwant\u00E9d,running"

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)

        # With lower casing
        tokenizer = self.get_tokenizer(do_lower_case=True)
        rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)

        sequence = "UNwant\u00E9d,running"

        tokens = tokenizer.tokenize(sequence)
        rust_tokens = rust_tokenizer.tokenize(sequence)
        self.assertListEqual(tokens, rust_tokens)

        ids = tokenizer.encode(sequence, add_special_tokens=False)
        rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
        self.assertListEqual(ids, rust_ids)

        rust_tokenizer = self.get_rust_tokenizer()
        ids = tokenizer.encode(sequence)
        rust_ids = rust_tokenizer.encode(sequence)
        self.assertListEqual(ids, rust_ids)
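
    # Added note (not in the original file): the test above checks slow/fast parity
    # on both tokens and ids, with and without special tokens, and then repeats the
    # same checks with do_lower_case=True for both tokenizers.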

    def test_chinese(self):
        tokenizer = BasicTokenizer()

        self.assertListEqual(tokenizer.tokenize("ah\u535A\u63A8zz"), ["ah", "\u535A", "\u63A8", "zz"])
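
    # Added note (not in the original file): BasicTokenizer surrounds CJK characters
    # with whitespace, so each of the two Chinese characters above becomes its own
    # token while the Latin runs "ah" and "zz" stay intact.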

    def test_basic_tokenizer_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["hello", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

    def test_basic_tokenizer_lower_strip_accents_false(self):
        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=False)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hällo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["h\u00E9llo"])

    def test_basic_tokenizer_lower_strip_accents_true(self):
        tokenizer = BasicTokenizer(do_lower_case=True, strip_accents=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])

    def test_basic_tokenizer_lower_strip_accents_default(self):
        tokenizer = BasicTokenizer(do_lower_case=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["hallo", "!", "how", "are", "you", "?"]
        )
        self.assertListEqual(tokenizer.tokenize("H\u00E9llo"), ["hello"])
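
    # Added note (not in the original file): the three strip_accents tests above show
    # that when do_lower_case=True and strip_accents is left unset, accents are
    # stripped by default ("HäLLo" -> "hallo"), i.e. the same behaviour as
    # strip_accents=True.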

    def test_basic_tokenizer_no_lower(self):
        tokenizer = BasicTokenizer(do_lower_case=False)

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_no_lower_strip_accents_false(self):
        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=False)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HäLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_no_lower_strip_accents_true(self):
        tokenizer = BasicTokenizer(do_lower_case=False, strip_accents=True)

        self.assertListEqual(
            tokenizer.tokenize(" \tHäLLo!how  \n Are yoU?  "), ["HaLLo", "!", "how", "Are", "yoU", "?"]
        )

    def test_basic_tokenizer_respects_never_split_tokens(self):
        tokenizer = BasicTokenizer(do_lower_case=False, never_split=["[UNK]"])

        self.assertListEqual(
            tokenizer.tokenize(" \tHeLLo!how  \n Are yoU? [UNK]"), ["HeLLo", "!", "how", "Are", "yoU", "?", "[UNK]"]
        )
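
    # Added note (not in the original file): without never_split=["[UNK]"],
    # BasicTokenizer would split the special token on its punctuation into
    # "[", "UNK", "]"; listing it in never_split keeps it intact.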

    def test_wordpiece_tokenizer(self):
        vocab_tokens = ["[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", "##ing"]

        vocab = {}
        for (i, token) in enumerate(vocab_tokens):
            vocab[token] = i
        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token="[UNK]")

        self.assertListEqual(tokenizer.tokenize(""), [])

        self.assertListEqual(tokenizer.tokenize("unwanted running"), ["un", "##want", "##ed", "runn", "##ing"])

        self.assertListEqual(tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
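
    # Added note (not in the original file): WordpieceTokenizer matches greedily,
    # longest piece first, within each whitespace-delimited word; if any remainder
    # cannot be matched (the trailing "X" in "unwantedX"), the whole word is emitted
    # as the unk_token rather than a partial split.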

    def test_is_whitespace(self):
        self.assertTrue(_is_whitespace(" "))
        self.assertTrue(_is_whitespace("\t"))
        self.assertTrue(_is_whitespace("\r"))
        self.assertTrue(_is_whitespace("\n"))
        self.assertTrue(_is_whitespace("\u00A0"))

        self.assertFalse(_is_whitespace("A"))
        self.assertFalse(_is_whitespace("-"))

    def test_is_control(self):
        self.assertTrue(_is_control("\u0005"))

        self.assertFalse(_is_control("A"))
        self.assertFalse(_is_control(" "))
        self.assertFalse(_is_control("\t"))
        self.assertFalse(_is_control("\r"))

    def test_is_punctuation(self):
        self.assertTrue(_is_punctuation("-"))
        self.assertTrue(_is_punctuation("$"))
        self.assertTrue(_is_punctuation("`"))
        self.assertTrue(_is_punctuation("."))

        self.assertFalse(_is_punctuation("A"))
        self.assertFalse(_is_punctuation(" "))

    def test_clean_text(self):
        tokenizer = self.get_tokenizer()
        rust_tokenizer = self.get_rust_tokenizer()

        # Example taken from the issue https://github.com/huggingface/tokenizers/issues/340
        self.assertListEqual([tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]])

        self.assertListEqual(
            [rust_tokenizer.tokenize(t) for t in ["Test", "\xad", "test"]], [["[UNK]"], [], ["[UNK]"]]
        )
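
    # Added note (not in the original file): "\xad" is a soft hyphen (Unicode
    # category Cf), which the tokenizer's text cleaning step treats as a control
    # character and removes, so it tokenizes to an empty list in both the Python and
    # Rust implementations.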

    @slow
    def test_sequence_builders(self):
        tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")

        text = tokenizer.encode("sequence builders", add_special_tokens=False)
        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)

        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)

        assert encoded_sentence == [101] + text + [102]
        assert encoded_pair == [101] + text + [102] + text_2 + [102]
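
    # Added note (not in the original file): 101 and 102 are the ids of "[CLS]" and
    # "[SEP]" in the bert-base-uncased vocab, so the assertions above check the
    # single-sequence form [CLS] A [SEP] and the pair form [CLS] A [SEP] B [SEP].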

    def test_offsets_with_special_characters(self):
        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
            with self.subTest("{} ({})".format(tokenizer.__class__.__name__, pretrained_name)):
                tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name, **kwargs)

                sentence = f"A, naïve {tokenizer_r.mask_token} AllenNLP sentence."
                tokens = tokenizer_r.encode_plus(
                    sentence,
                    return_attention_mask=False,
                    return_token_type_ids=False,
                    return_offsets_mapping=True,
                    add_special_tokens=True,
                )

                do_lower_case = tokenizer_r.do_lower_case if hasattr(tokenizer_r, "do_lower_case") else False
                expected_results = (
                    [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "A"),
                        ((1, 2), ","),
                        ((3, 5), "na"),
                        ((5, 6), "##ï"),
                        ((6, 8), "##ve"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "Allen"),
                        ((21, 23), "##NL"),
                        ((23, 24), "##P"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                    if not do_lower_case
                    else [
                        ((0, 0), tokenizer_r.cls_token),
                        ((0, 1), "a"),
                        ((1, 2), ","),
                        ((3, 8), "naive"),
                        ((9, 15), tokenizer_r.mask_token),
                        ((16, 21), "allen"),
                        ((21, 23), "##nl"),
                        ((23, 24), "##p"),
                        ((25, 33), "sentence"),
                        ((33, 34), "."),
                        ((0, 0), tokenizer_r.sep_token),
                    ]
                )

                self.assertEqual(
                    [e[1] for e in expected_results], tokenizer_r.convert_ids_to_tokens(tokens["input_ids"])
                )
                self.assertEqual([e[0] for e in expected_results], tokens["offset_mapping"])
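
    # Added note (not in the original file): special tokens are reported with a
    # (0, 0) offset. The cased branch keeps "naïve" split into "na", "##ï", "##ve",
    # while the uncased branch lowercases and strips the accent so "naive" maps to a
    # single token spanning offsets (3, 8).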