Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F102662263
PhutilUTF8TestCase.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Sun, Feb 23, 00:09
Size
7 KB
Mime Type
text/x-php
Expires
Tue, Feb 25, 00:09 (2 d)
Engine
blob
Format
Raw Data
Handle
24385901
Attached To
rPHU libphutil
PhutilUTF8TestCase.php
View Options
<?php
/**
 * Test cases for functions in utf8.php.
 *
 * Each fixture below spells out its bytes with "\x.." escapes inside a
 * single-line double-quoted literal, so the exact byte sequence under test
 * is visible in the source.
 *
 * @group testcase
 */
final class PhutilUTF8TestCase extends PhutilTestCase {

  /**
   * ASCII input, including the control bytes 0x01 and 0x7F, must be returned
   * unchanged by phutil_utf8ize().
   */
  public function testUTF8ize_ASCII_ignored() {
    $input = "this\x01 is a \x7f test string";
    $this->assertEqual($input, phutil_utf8ize($input));
  }

  /**
   * Well-formed two- and three-byte UTF-8 sequences must be returned
   * unchanged by phutil_utf8ize().
   */
  public function testUTF8ize_UTF8_ignored() {
    $input = "\xc3\x9c \xc3\xbc \xe6\x9d\xb1!";
    $this->assertEqual($input, phutil_utf8ize($input));
  }

  /**
   * Run phutil_utf8ize() over a 1 MiB string. The only thing checked is that
   * the call returns at all.
   */
  public function testUTF8ize_LongString_nosegfault() {
    // For some reason my laptop is segfaulting on long inputs inside
    // preg_match(). Forestall this craziness in the common case, at least.
    phutil_utf8ize(str_repeat('x', 1024 * 1024));

    // Reaching this line is the assertion; record one passing check.
    $this->assertEqual(true, true);
  }

  /**
   * Each invalid byte in the input is expected to be replaced by the three
   * bytes EF BF BD (the UTF-8 encoding of U+FFFD), leaving the surrounding
   * ASCII text intact.
   */
  public function testUTF8ize_invalidUTF8_fixed() {
    $input =
      "\xc3 this has \xe6\x9d some invalid utf8 \xe6";
    $expect =
      "\xEF\xBF\xBD this has \xEF\xBF\xBD\xEF\xBF\xBD some invalid utf8 ".
      "\xEF\xBF\xBD";

    $result = phutil_utf8ize($input);
    $this->assertEqual($expect, $result);
  }

  /**
   * Three consecutive invalid bytes (EE FF FF) are each expected to become
   * one EF BF BD replacement sequence.
   */
  public function testUTF8ize_owl_isCuteAndFerocious() {
    // This was once a ferocious owl when we used to use "?" as the replacement
    // character instead of U+FFFD, but now he is sort of not as cute or
    // ferocious.
    $input = "M(o\xEE\xFF\xFFo)M";
    $expect = "M(o\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBDo)M";

    $result = phutil_utf8ize($input);
    $this->assertEqual($expect, $result);
  }

  /**
   * phutil_utf8_strlen() is checked against a map of input string to
   * expected length, where a multibyte sequence counts as one.
   */
  public function testUTF8len() {
    $strings = array(
      '' => 0,
      'x' => 1,
      "\xEF\xBF\xBD" => 1,
      "x\xe6\x9d\xb1y" => 3,
      "xyz" => 3,
      'quack' => 5,
    );

    foreach ($strings as $str => $expect) {
      $this->assertEqual(
        $expect,
        phutil_utf8_strlen($str),
        'Length of '.$str);
    }
  }

  /**
   * phutil_utf8v() is checked against a map of input string to the expected
   * list of per-character substrings.
   */
  public function testUTF8v() {
    $strings = array(
      '' => array(),
      'x' => array('x'),
      'quack' => array('q', 'u', 'a', 'c', 'k'),
      "x\xe6\x9d\xb1y" => array('x', "\xe6\x9d\xb1", 'y'),

      // This is a combining character.
      "x\xCD\xA0y" => array("x", "\xCD\xA0", 'y'),
    );

    foreach ($strings as $str => $expect) {
      $this->assertEqual(
        $expect,
        phutil_utf8v($str),
        'Vector of '.$str);
    }
  }

  /**
   * phutil_utf8v_codepoints() is checked against a map of input string to
   * the expected list of integer codepoints, covering one- through four-byte
   * sequences.
   */
  public function testUTF8vCodepoints() {
    $strings = array(
      '' => array(),
      'x' => array(0x78),
      'quack' => array(0x71, 0x75, 0x61, 0x63, 0x6B),
      "x\xe6\x9d\xb1y" => array(0x78, 0x6771, 0x79),

      "\xC2\xBB" => array(0x00BB),
      "\xE2\x98\x83" => array(0x2603),
      "\xEF\xBF\xBF" => array(0xFFFF),
      "\xF0\x9F\x92\xA9" => array(0x1F4A9),

      // This is a combining character.
      "x\xCD\xA0y" => array(0x78, 0x0360, 0x79),
    );

    foreach ($strings as $str => $expect) {
      $this->assertEqual(
        $expect,
        phutil_utf8v_codepoints($str),
        'Codepoint Vector of '.$str);
    }
  }

  /**
   * phutil_utf8_console_strlen() is checked against a map of input string to
   * expected console width: a NUL byte is expected to measure 0 and the
   * double-width character 2.
   */
  public function testUTF8ConsoleStrlen() {
    $strings = array(
      "" => 0,
      "\0" => 0,
      "x" => 1,

      // Double-width chinese character.
      "\xe6\x9d\xb1" => 2,
    );

    foreach ($strings as $str => $expect) {
      $this->assertEqual(
        $expect,
        phutil_utf8_console_strlen($str),
        'Console Length of '.$str);
    }
  }

  /**
   * Table-driven check of phutil_utf8_shorten(). Each row is
   * (string, length, terminal, expected result). Afterwards, a terminal
   * longer than the requested length is expected to raise an exception.
   */
  public function testUTF8shorten() {
    $inputs = array(
      array("1erp derp derp", 9, "", "1erp derp"),
      array("2erp derp derp", 12, "...", "2erp derp..."),
      array("derpxderpxderp", 12, "...", "derpxderp..."),
      array("derp\xE2\x99\x83derpderp", 12, "...", "derp\xE2\x99\x83derp..."),
      array("", 12, "...", ""),
      array("derp", 12, "...", "derp"),
      array("11111", 5, "2222", "11111"),
      array("111111", 5, "2222", "12222"),

      array("D1rp. Derp derp.", 7, "...", "D1rp."),
      array("D2rp. Derp derp.", 5, "...", "D2rp."),
      array("D3rp. Derp derp.", 4, "...", "D..."),
      array("D4rp. Derp derp.", 14, "...", "D4rp. Derp..."),
      array("D5rpderp, derp derp", 16, "...", "D5rpderp..."),
      array("D6rpderp, derp derp", 17, "...", "D6rpderp, derp..."),

      // This behavior is maybe a little bad, but it seems mostly reasonable,
      // at least for latin languages.
      array(
        "Derp, supercalafragalisticexpialadoshus",
        30,
        "...",
        "Derp...",
      ),

      // If a string has only word-break characters in it, we should just cut
      // it, not produce only the terminal.
      array("((((((((((", 8, '...', '(((((...'),
    );

    foreach ($inputs as $input) {
      list($string, $length, $terminal, $expect) = $input;
      $result = phutil_utf8_shorten($string, $length, $terminal);
      $this->assertEqual($expect, $result, 'Shortening of '.$string);
    }

    // Same capture pattern as the other exception tests in this class:
    // $caught stays null unless the call throws.
    $caught = null;
    try {
      phutil_utf8_shorten('derp', 3, 'quack');
    } catch (Exception $ex) {
      $caught = $ex;
    }

    $this->assertEqual(true, (bool)$caught, 'Expect exception for terminal.');
  }

  /**
   * Table-driven check of phutil_utf8_hard_wrap_html(). Each row is
   * (string, width, expected list of lines). The expectations show markup
   * ("<b>") not counting toward the width and a multibyte character
   * counting once.
   */
  public function testUTF8Wrap() {
    $inputs = array(
      array(
        'aaaaaaa',
        3,
        array(
          'aaa',
          'aaa',
          'a',
        ),
      ),
      array(
        'aa<b>aaaaa',
        3,
        array(
          'aa<b>a',
          'aaa',
          'a',
        ),
      ),
      array(
        'aa&aaaa',
        3,
        array(
          'aa&',
          'aaa',
          'a',
        ),
      ),
      array(
        "aa\xe6\x9d\xb1aaaa",
        3,
        array(
          "aa\xe6\x9d\xb1",
          'aaa',
          'a',
        ),
      ),
      array(
        '',
        80,
        array(
        ),
      ),
      array(
        'a',
        80,
        array(
          'a',
        ),
      ),
    );

    foreach ($inputs as $input) {
      list($string, $width, $expect) = $input;
      $this->assertEqual(
        $expect,
        phutil_utf8_hard_wrap_html($string, $width),
        "Wrapping of '".$string."'");
    }
  }

  /**
   * phutil_utf8_convert() is expected to throw when either encoding
   * argument is the empty string.
   */
  public function testUTF8ConvertParams() {
    // Third argument (source encoding) left empty.
    $caught = null;
    try {
      phutil_utf8_convert('', 'utf8', '');
    } catch (Exception $ex) {
      $caught = $ex;
    }
    $this->assertEqual(true, (bool)$caught, 'Requires source encoding.');

    // Second argument (target encoding) left empty.
    $caught = null;
    try {
      phutil_utf8_convert('', '', 'utf8');
    } catch (Exception $ex) {
      $caught = $ex;
    }
    $this->assertEqual(true, (bool)$caught, 'Requires target encoding.');
  }

  /**
   * Convert an ISO-8859-1 byte string to UTF-8 and compare against the
   * expected bytes, then confirm that an unknown encoding name raises an
   * exception. Marked as skipped when mb_convert_encoding() is unavailable.
   */
  public function testUTF8Convert() {
    if (!function_exists('mb_convert_encoding')) {
      // NOTE(review): the code below still runs unless assertSkipped()
      // aborts the test itself -- confirm against PhutilTestCase.
      $this->assertSkipped("Requires mbstring extension.");
    }

    // "[ae]gis se[n]or [(c)] 1970 [+/-] 1 [degree]"
    $input = "\xE6gis SE\xD1OR \xA9 1970 \xB11\xB0";
    $expect = "\xC3\xA6gis SE\xC3\x91OR \xC2\xA9 1970 \xC2\xB11\xC2\xB0";
    $output = phutil_utf8_convert($input, 'UTF-8', 'ISO-8859-1');

    $this->assertEqual($expect, $output, 'Conversion from ISO-8859-1.');

    $caught = null;
    try {
      phutil_utf8_convert('xyz', 'moon language', 'UTF-8');
    } catch (Exception $ex) {
      $caught = $ex;
    }

    $this->assertEqual(true, (bool)$caught, 'Conversion with bogus encoding.');
  }

}
Event Timeline
Log In to Comment