Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F99208455
PhutilUTF8TestCase.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Wed, Jan 22, 09:53
Size
15 KB
Mime Type
text/x-php
Expires
Fri, Jan 24, 09:53 (2 d)
Engine
blob
Format
Raw Data
Handle
23739094
Attached To
rPHU libphutil
PhutilUTF8TestCase.php
View Options
<?php
/**
* Test cases for functions in utf8.php.
*/
final
class
PhutilUTF8TestCase
extends
PhutilTestCase
{
/**
 * phutil_utf8ize() must return a string made only of 7-bit bytes unchanged,
 * even when it contains control bytes such as 0x01 and 0x7F.
 */
public function testUTF8izeASCIIIgnored() {
  // NOTE(review): whitespace next to the escapes was lost in extraction and
  // has been restored here; the assertion does not depend on it.
  $input = "this \x01 is a \x7f test string";
  $this->assertEqual($input, phutil_utf8ize($input));
}
/**
 * phutil_utf8ize() must return already-valid UTF-8 unchanged. The input mixes
 * two 2-byte sequences, one 3-byte sequence and an ASCII byte.
 */
public function testUTF8izeUTF8Ignored() {
  $input = "\xc3\x9c\xc3\xbc\xe6\x9d\xb1!";
  $this->assertEqual($input, phutil_utf8ize($input));
}
/**
 * Smoke test: run phutil_utf8ize() over a 1 MiB ASCII string. The test passes
 * simply by reaching the final assertion.
 */
public function testUTF8izeLongStringNosegfault() {
  // For some reason my laptop is segfaulting on long inputs inside
  // preg_match(). Forestall this craziness in the common case, at least.
  $one_mebibyte = 1024 * 1024;
  $long_input = str_repeat('x', $one_mebibyte);
  phutil_utf8ize($long_input);

  // Nothing to compare against; getting here without crashing is the check.
  $this->assertTrue(true);
}
/**
 * phutil_utf8ize() must replace every invalid byte with the three-byte
 * sequence EF BF BD (U+FFFD) and leave the surrounding ASCII text intact.
 */
public function testUTF8izeInvalidUTF8Fixed() {
  // Four stray bytes: a lone lead byte, a truncated 3-byte sequence (two
  // bytes), and a lone lead byte at the very end of the string.
  $input =
    "\xc3 this has \xe6\x9d some invalid utf8 \xe6";
  $expect =
    "\xEF\xBF\xBD this has \xEF\xBF\xBD\xEF\xBF\xBD some invalid utf8 ".
    "\xEF\xBF\xBD";
  $result = phutil_utf8ize($input);
  $this->assertEqual($expect, $result);
}
/**
 * Three consecutive invalid bytes inside ASCII text must each become one
 * EF BF BD (U+FFFD) sequence.
 */
public function testUTF8izeOwlIsCuteAndFerocious() {
  // This was once a ferocious owl when we used to use "?" as the replacement
  // character instead of U+FFFD, but now he is sort of not as cute or
  // ferocious.
  $input = "M(o\xEE\xFF\xFFo)M";
  $expect = "M(o\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBDo)M";
  $result = phutil_utf8ize($input);
  $this->assertEqual($expect, $result);
}
/**
 * phutil_utf8_strlen() must count characters rather than bytes. Each map
 * entry is `input => expected length`.
 */
public function testUTF8len() {
  $strings = array(
    '' => 0,
    'x' => 1,
    // One 3-byte character.
    "\xEF\xBF\xBD" => 1,
    // ASCII, one 3-byte character, ASCII.
    "x\xe6\x9d\xb1y" => 3,
    'xyz' => 3,
    'quack' => 5,
  );
  foreach ($strings as $str => $expect) {
    $this->assertEqual(
      $expect,
      phutil_utf8_strlen($str),
      'Length of '.$str);
  }
}
/**
 * phutil_utf8v() must split a string into a list of characters, keeping the
 * bytes of each multibyte character together. Each map entry is
 * `input => expected list`.
 */
public function testUTF8v() {
  $strings = array(
    '' => array(),
    'x' => array('x'),
    'quack' => array('q', 'u', 'a', 'c', 'k'),
    "x\xe6\x9d\xb1y" => array('x', "\xe6\x9d\xb1", 'y'),

    // This is a combining character.
    "x\xCD\xA0y" => array('x', "\xCD\xA0", 'y'),
  );
  foreach ($strings as $str => $expect) {
    $this->assertEqual(
      $expect,
      phutil_utf8v($str),
      'Vector of '.$str);
  }
}
/**
 * phutil_utf8v_codepoints() must decode a string into a list of integer
 * code points. The cases cover 1-, 2-, 3- and 4-byte sequences. Each map
 * entry is `input => expected list`.
 */
public function testUTF8vCodepoints() {
  $strings = array(
    '' => array(),
    'x' => array(0x78),
    'quack' => array(0x71, 0x75, 0x61, 0x63, 0x6B),
    "x\xe6\x9d\xb1y" => array(0x78, 0x6771, 0x79),

    "\xC2\xBB" => array(0x00BB),
    "\xE2\x98\x83" => array(0x2603),
    "\xEF\xBF\xBF" => array(0xFFFF),
    "\xF0\x9F\x92\xA9" => array(0x1F4A9),

    // This is a combining character.
    "x\xCD\xA0y" => array(0x78, 0x0360, 0x79),
  );
  foreach ($strings as $str => $expect) {
    $this->assertEqual(
      $expect,
      phutil_utf8v_codepoints($str),
      'Codepoint Vector of '.$str);
  }
}
/**
 * phutil_utf8_console_strlen() must report the expected display width for
 * each input. The cases cover a NUL byte, a double-width character, a
 * combining character and escape sequences starting with 0x1B. Each map
 * entry is `input => expected width`.
 */
public function testUTF8ConsoleStrlen() {
  $strings = array(
    '' => 0,
    "\0" => 0,
    'x' => 1,

    // Double-width chinese character.
    "\xe6\x9d\xb1" => 2,

    // Combining character.
    "x\xCD\xA0y" => 2,

    // Combining plus double-width.
    "\xe6\x9d\xb1\xCD\xA0y" => 3,

    // Colors and formatting.
    "\x1B[1mx\x1B[m" => 1,
    "\x1B[1m\x1B[31mx\x1B[m" => 1,
  );
  foreach ($strings as $str => $expect) {
    $this->assertEqual(
      $expect,
      phutil_utf8_console_strlen($str),
      'Console Length of '.$str);
  }
}
/**
 * Glyph-limited truncation with PhutilUTF8StringTruncator. Each case is
 * `array(input, maximum glyphs, terminator, expected output)`.
 */
public function testUTF8shorten() {
  $inputs = array(
    array('1erp derp derp', 9, '', '1erp derp'),
    array('2erp derp derp', 12, '...', '2erp derp...'),
    array('derpxderpxderp', 12, '...', 'derpxderp...'),
    array("derp\xE2\x99\x83derpderp", 12, '...', "derp\xE2\x99\x83derp..."),
    array('', 12, '...', ''),
    array('derp', 12, '...', 'derp'),
    array('11111', 5, '2222', '11111'),
    array('111111', 5, '2222', '12222'),

    array('D1rp. Derp derp.', 7, '...', 'D1rp.'),

    // "D2rp." is a better shortening of this, but it's dramatically more
    // complicated to implement with the newer byte/glyph/character
    // shortening code.
    array('D2rp. Derp derp.', 5, '...', 'D2...'),
    array('D3rp. Derp derp.', 4, '...', 'D...'),
    array('D4rp. Derp derp.', 14, '...', 'D4rp. Derp...'),
    array('D5rpderp, derp derp', 16, '...', 'D5rpderp...'),
    array('D6rpderp, derp derp', 17, '...', 'D6rpderp, derp...'),

    // Strings with combining characters.
    array("Gr\xCD\xA0mpyCatSmiles", 8, '...', "Gr\xCD\xA0mpy..."),
    array("X\xCD\xA0\xCD\xA0\xCD\xA0Y", 1, '', "X\xCD\xA0\xCD\xA0\xCD\xA0"),

    // This behavior is maybe a little bad, but it seems mostly reasonable,
    // at least for latin languages.
    array(
      'Derp, supercalafragalisticexpialadoshus',
      30,
      '...',
      'Derp...',
    ),

    // If a string has only word-break characters in it, we should just cut
    // it, not produce only the terminal.
    array('((((((((((', 8, '...', '(((((...'),

    // Terminal is longer than requested input.
    array('derp', 3, 'quack', 'quack'),
  );

  foreach ($inputs as $input) {
    list($string, $length, $terminal, $expect) = $input;
    $result = id(new PhutilUTF8StringTruncator())
      ->setMaximumGlyphs($length)
      ->setTerminator($terminal)
      ->truncateString($string);
    $this->assertEqual($expect, $result, 'Shortening of '.$string);
  }
}
/**
 * Truncate the same input under three different limits (bytes, code points,
 * glyphs) and check each result. The terminator is always "!".
 *
 * Each case is:
 * `array(input, byte limit, byte result, codepoint limit, codepoint result,
 *   glyph limit, glyph result)`.
 */
public function testUTF8StringTruncator() {
  $cases = array(
    array(
      "o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0",
      6, "o\xCD\xA0!",
      6, "o\xCD\xA0o\xCD\xA0!",
      6, "o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0o\xCD\xA0",
    ),
    array(
      "X\xCD\xA0\xCD\xA0\xCD\xA0Y",
      6, '!',
      6, "X\xCD\xA0\xCD\xA0\xCD\xA0Y",
      6, "X\xCD\xA0\xCD\xA0\xCD\xA0Y",
    ),
    array(
      "X\xCD\xA0\xCD\xA0\xCD\xA0YZ",
      6, '!',
      5, "X\xCD\xA0\xCD\xA0\xCD\xA0!",
      2, "X\xCD\xA0\xCD\xA0\xCD\xA0!",
    ),
    array(
      "\xE2\x98\x83\xE2\x98\x83\xE2\x98\x83\xE2\x98\x83",
      4, "\xE2\x98\x83!",
      3, "\xE2\x98\x83\xE2\x98\x83!",
      3, "\xE2\x98\x83\xE2\x98\x83!",
    ),
  );

  foreach ($cases as $case) {
    list($input, $b_len, $b_out, $p_len, $p_out, $g_len, $g_out) = $case;

    // Limit measured in bytes.
    $result = id(new PhutilUTF8StringTruncator())
      ->setMaximumBytes($b_len)
      ->setTerminator('!')
      ->truncateString($input);
    $this->assertEqual($b_out, $result, 'byte-short of '.$input);

    // Limit measured in code points.
    $result = id(new PhutilUTF8StringTruncator())
      ->setMaximumCodepoints($p_len)
      ->setTerminator('!')
      ->truncateString($input);
    $this->assertEqual($p_out, $result, 'codepoint-short of '.$input);

    // Limit measured in glyphs.
    $result = id(new PhutilUTF8StringTruncator())
      ->setMaximumGlyphs($g_len)
      ->setTerminator('!')
      ->truncateString($input);
    $this->assertEqual($g_out, $result, 'glyph-short of '.$input);
  }
}
/**
 * phutil_utf8_hard_wrap_html() must break a string into chunks of at most
 * the given width. In the expectations below, the "<b>" tag does not count
 * toward the width and a multibyte character counts as one. Each case is
 * `array(input, width, expected chunks)`.
 */
public function testUTF8Wrap() {
  $inputs = array(
    array(
      'aaaaaaa',
      3,
      array(
        'aaa',
        'aaa',
        'a',
      ),
    ),
    array(
      'aa<b>aaaaa',
      3,
      array(
        'aa<b>a',
        'aaa',
        'a',
      ),
    ),
    array(
      'aa&aaaa',
      3,
      array(
        'aa&',
        'aaa',
        'a',
      ),
    ),
    array(
      "aa\xe6\x9d\xb1aaaa",
      3,
      array(
        "aa\xe6\x9d\xb1",
        'aaa',
        'a',
      ),
    ),
    array(
      '',
      80,
      array(
      ),
    ),
    array(
      'a',
      80,
      array(
        'a',
      ),
    ),
  );

  foreach ($inputs as $input) {
    list($string, $width, $expect) = $input;
    $this->assertEqual(
      $expect,
      phutil_utf8_hard_wrap_html($string, $width),
      "Wrapping of '".$string."'");
  }
}
/**
 * phutil_utf8_hard_wrap() must break plain text into chunks of at most the
 * given width. In the last case the newlines in the input do not appear in
 * any expected chunk. Each case is `array(input, width, expected chunks)`.
 */
public function testUTF8NonHTMLWrap() {
  $inputs = array(
    array(
      'aaaaaaa',
      3,
      array(
        'aaa',
        'aaa',
        'a',
      ),
    ),
    array(
      'abracadabra!',
      4,
      array(
        'abra',
        'cada',
        'bra!',
      ),
    ),
    array(
      '',
      10,
      array(
      ),
    ),
    array(
      'a',
      20,
      array(
        'a',
      ),
    ),
    array(
      "aa\xe6\x9d\xb1aaaa",
      3,
      array(
        "aa\xe6\x9d\xb1",
        'aaa',
        'a',
      ),
    ),
    array(
      "mmm\nmmm\nmmmm",
      3,
      array(
        'mmm',
        'mmm',
        'mmm',
        'm',
      ),
    ),
  );

  foreach ($inputs as $input) {
    list($string, $width, $expect) = $input;
    $this->assertEqual(
      $expect,
      phutil_utf8_hard_wrap($string, $width),
      "Wrapping of '".$string."'");
  }
}
/**
 * phutil_utf8_convert() must throw when either encoding argument is the
 * empty string.
 */
public function testUTF8ConvertParams() {
  // Each row: second argument, third argument, assertion message.
  $attempts = array(
    array('utf8', '', 'Requires source encoding.'),
    array('', 'utf8', 'Requires target encoding.'),
  );

  foreach ($attempts as $attempt) {
    list($second, $third, $message) = $attempt;

    $threw = false;
    try {
      phutil_utf8_convert('', $second, $third);
    } catch (Exception $ex) {
      $threw = true;
    }

    $this->assertTrue($threw, $message);
  }
}
/**
 * phutil_utf8_convert() must transcode ISO-8859-1 bytes to UTF-8 and must
 * throw for an unknown encoding name. The test is skipped when
 * mb_convert_encoding() is unavailable.
 */
public function testUTF8Convert() {
  if (!function_exists('mb_convert_encoding')) {
    // NOTE(review): presumably assertSkipped() aborts the test here; confirm
    // against PhutilTestCase.
    $this->assertSkipped('Requires mbstring extension.');
  }

  // "[ae]gis se[n]or [(c)] 1970 [+/-] 1 [degree]"
  // NOTE(review): spaces next to the escapes were lost in extraction; they
  // are restored identically in the input and the expectation.
  $input = "\xE6gis SE\xD1OR \xA9 1970 \xB11\xB0";
  $expect = "\xC3\xA6gis SE\xC3\x91OR \xC2\xA9 1970 \xC2\xB11\xC2\xB0";
  $output = phutil_utf8_convert($input, 'UTF-8', 'ISO-8859-1');

  $this->assertEqual($expect, $output, 'Conversion from ISO-8859-1.');

  // An unrecognized encoding name must raise an exception.
  $caught = null;
  try {
    phutil_utf8_convert('xyz', 'moon language', 'UTF-8');
  } catch (Exception $ex) {
    $caught = $ex;
  }

  $this->assertTrue((bool)$caught, 'Conversion with bogus encoding.');
}
/**
 * phutil_utf8_ucwords() must uppercase the first letter of each word and
 * leave every other character as it was. Each map entry is
 * `input => expected output`.
 */
public function testUTF8ucwords() {
  $expectations = array(
    '' => '',
    'x' => 'X',
    'X' => 'X',
    'five short graybles' => 'Five Short Graybles',
    'xXxSNiPeRKiLLeRxXx' => 'XXxSNiPeRKiLLeRxXx',
  );

  foreach ($expectations as $raw => $capitalized) {
    $actual = phutil_utf8_ucwords($raw);
    $label = 'phutil_utf8_ucwords("'.$raw.'")';
    $this->assertEqual($capitalized, $actual, $label);
  }
}
/**
 * phutil_utf8_strtolower() must lowercase ASCII letters and leave
 * punctuation, digits and a caseless multibyte character unchanged. Each map
 * entry is `input => expected output`.
 */
public function testUTF8strtolower() {
  $tests = array(
    '' => '',
    'a' => 'a',
    'A' => 'a',
    '!' => '!',
    'OMG!~ LOLolol ROFLwaffle11~' => 'omg!~ lololol roflwaffle11~',
    "\xE2\x98\x83" => "\xE2\x98\x83",
  );

  foreach ($tests as $input => $expect) {
    $this->assertEqual(
      $expect,
      phutil_utf8_strtolower($input),
      'phutil_utf8_strtolower("'.$input.'")');
  }
}
/**
 * phutil_utf8_strtoupper() must uppercase ASCII letters and leave
 * punctuation, digits and a caseless multibyte character unchanged. Each map
 * entry is `input => expected output`.
 */
public function testUTF8strtoupper() {
  $tests = array(
    '' => '',
    'a' => 'A',
    'A' => 'A',
    '!' => '!',
    'Cats have 9 lives.' => 'CATS HAVE 9 LIVES.',
    "\xE2\x98\x83" => "\xE2\x98\x83",
  );

  foreach ($tests as $input => $expect) {
    $this->assertEqual(
      $expect,
      phutil_utf8_strtoupper($input),
      'phutil_utf8_strtoupper("'.$input.'")');
  }
}
/**
 * phutil_utf8_is_combining_character() must return true for the two-byte
 * sequence CD A0 and false for a plain ASCII letter.
 */
public function testUTF8IsCombiningCharacter() {
  $character = "\xCD\xA0";
  $this->assertEqual(
    true,
    phutil_utf8_is_combining_character($character));

  $character = 'a';
  $this->assertEqual(
    false,
    phutil_utf8_is_combining_character($character));
}
/**
 * phutil_utf8v_combined() must split a string into a list in which each
 * combining character (CD A0 here) stays attached to the character before
 * it. Leading combining characters, which have no base, are grouped together.
 */
public function testUTF8vCombined() {
  // Empty string.
  $string = '';
  $this->assertEqual(array(), phutil_utf8v_combined($string));

  // Single character.
  $string = 'x';
  $this->assertEqual(array('x'), phutil_utf8v_combined($string));

  // No combining characters.
  $string = 'cat';
  $this->assertEqual(
    array('c', 'a', 't'),
    phutil_utf8v_combined($string));

  // String with a combining character in the middle.
  $string = "ca\xCD\xA0t";
  $this->assertEqual(
    array('c', "a\xCD\xA0", 't'),
    phutil_utf8v_combined($string));

  // String starting with a combined character.
  $string = "c\xCD\xA0at";
  $this->assertEqual(
    array("c\xCD\xA0", 'a', 't'),
    phutil_utf8v_combined($string));

  // String with trailing combining character.
  $string = "cat\xCD\xA0";
  $this->assertEqual(
    array('c', 'a', "t\xCD\xA0"),
    phutil_utf8v_combined($string));

  // String with multiple combined characters.
  $string = "c\xCD\xA0a\xCD\xA0t\xCD\xA0";
  $this->assertEqual(
    array("c\xCD\xA0", "a\xCD\xA0", "t\xCD\xA0"),
    phutil_utf8v_combined($string));

  // String with multiple combining characters.
  $string = "ca\xCD\xA0\xCD\xA0t";
  $this->assertEqual(
    array('c', "a\xCD\xA0\xCD\xA0", 't'),
    phutil_utf8v_combined($string));

  // String beginning with a combining character.
  $string = "\xCD\xA0\xCD\xA0c";
  $this->assertEqual(
    array("\xCD\xA0\xCD\xA0", 'c'),
    phutil_utf8v_combined($string));
}
/**
 * Smoke test: run phutil_is_utf8_with_only_bmp_characters() over 32768
 * repetitions of a 3-byte character. The return value is ignored.
 */
public function testUTF8BMPSegfaults() {
  // This test case fails by segfaulting, or passes by not segfaulting. See
  // the function implementation for details.
  $input = str_repeat("\xEF\xBF\xBF", 1024 * 32);
  phutil_is_utf8_with_only_bmp_characters($input);

  $this->assertTrue(true);
}
/**
 * Check UTF-8 validity and BMP-only detection on the same inputs.
 *
 * Each map entry is
 * `input => array(expected valid UTF-8, expected BMP-only, label)`.
 * The validity expectation is checked against both phutil_is_utf8() and
 * phutil_is_utf8_slowly(); the BMP expectation against
 * phutil_is_utf8_with_only_bmp_characters().
 */
public function testUTF8BMP() {
  $tests = array(
    '' => array(
      true,
      true,
      'empty string',
    ),
    'a' => array(
      true,
      true,
      'a',
    ),
    "a\xCD\xA0\xCD\xA0" => array(
      true,
      true,
      'a with combining',
    ),
    "\xE2\x98\x83" => array(
      true,
      true,
      'snowman',
    ),

    // This is the last character in BMP, U+FFFF.
    "\xEF\xBF\xBF" => array(
      true,
      true,
      'U+FFFF',
    ),

    // This isn't valid.
    "\xEF\xBF\xC0" => array(
      false,
      false,
      'Invalid, byte range.',
    ),

    // This is an invalid nonminimal representation.
    "\xF0\x81\x80\x80" => array(
      false,
      false,
      'Nonminimal 4-byte character.',
    ),

    // This is the first character above BMP, U+10000.
    "\xF0\x90\x80\x80" => array(
      true,
      false,
      'U+10000',
    ),
    "\xF0\x9D\x84\x9E" => array(
      true,
      false,
      'gclef',
    ),

    "musical \xF0\x9D\x84\x9E g-clef" => array(
      true,
      false,
      'gclef text',
    ),
    "\xF0\x9D\x84" => array(
      false,
      false,
      'Invalid, truncated.',
    ),

    "\xE0\x80\x80" => array(
      false,
      false,
      'Nonminimal 3-byte character.',
    ),

    // Partial BMP characters.
    "\xCD" => array(
      false,
      false,
      'Partial 2-byte character.',
    ),
    "\xE0\xA0" => array(
      false,
      false,
      'Partial BMP 0xE0 character.',
    ),
    "\xE2\x98" => array(
      false,
      false,
      'Partial BMP character.',
    ),
  );

  foreach ($tests as $input => $test) {
    list($expect_utf8, $expect_bmp, $test_name) = $test;

    // Depending on what's installed on the system, this may use an
    // extension.
    $this->assertEqual(
      $expect_utf8,
      phutil_is_utf8($input),
      pht('is_utf(%s)', $test_name));

    // Also test this against the pure PHP implementation, explicitly.
    $this->assertEqual(
      $expect_utf8,
      phutil_is_utf8_slowly($input),
      pht('is_utf_slowly(%s)', $test_name));

    $this->assertEqual(
      $expect_bmp,
      phutil_is_utf8_with_only_bmp_characters($input),
      pht('is_utf_bmp(%s)', $test_name));
  }
}
}
Event Timeline
Log In to Comment