Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F90899397
utf8.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, Nov 5, 19:09
Size
7 KB
Mime Type
text/x-php
Expires
Thu, Nov 7, 19:09 (2 d)
Engine
blob
Format
Raw Data
Handle
22155419
Attached To
rPHU libphutil
utf8.php
View Options
<?php
/*
* Copyright 2011 Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Convert a string into valid UTF-8. This function is quite slow.
 *
 * Strings which are already valid are returned unchanged. Otherwise the
 * string is scanned from left to right: well-formed UTF-8 sequences are copied
 * through, and every byte which is not part of a well-formed sequence is
 * individually replaced with U+FFFD, the Unicode replacement character.
 *
 * "Well-formed" follows RFC 3629: overlong encodings, UTF-16 surrogates
 * (U+D800..U+DFFF) and code points above U+10FFFF are treated as invalid, so
 * they do not survive into the result. NUL (U+0000) is a valid character and
 * is preserved.
 *
 * @param string String to convert to valid UTF-8.
 * @return string String with invalid UTF-8 byte subsequences replaced with
 *                U+FFFD.
 * @group utf8
 */
function phutil_utf8ize($string) {
  if (phutil_is_utf8($string)) {
    return $string;
  }

  // There is no function in iconv, mbstring or ICU to do this, so do it
  // (very very slowly) in pure PHP.

  // TODO: Provide an optional fast C implementation ala fb_utf8ize() if this
  // ever shows up in profiles?

  // Group 1 matches a run of ASCII bytes or exactly one well-formed multibyte
  // sequence. Group 2 matches any single byte that group 1 rejected. The
  // pattern is single-quoted so PCRE (not PHP) interprets the \x escapes, and
  // it has no "u" modifier, so "." means "one byte". "A" anchors each match
  // at $offset; "s" lets "." match newlines too.
  $regex =
    '/('.
      '[\x00-\x7F]++'.
      '|[\xC2-\xDF][\x80-\xBF]'.
      '|\xE0[\xA0-\xBF][\x80-\xBF]'.
      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.
      '|\xED[\x80-\x9F][\x80-\xBF]'.
      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.
      '|[\xF1-\xF3][\x80-\xBF]{3}'.
      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.
    ')|(.)/As';

  // Unicode replacement character, U+FFFD.
  $replacement = "\xEF\xBF\xBD";

  $result = array();
  $length = strlen($string);
  $offset = 0;
  while ($offset < $length) {
    $matches = null;
    $matched = preg_match($regex, $string, $matches, 0, $offset);

    if ($matched === 1 && !isset($matches[2])) {
      // Well-formed text: copy it through untouched.
      $result[] = $matches[1];
      $offset += strlen($matches[1]);
    } else {
      // Either group 2 matched a stray byte, or preg_match() itself failed.
      // In both cases substitute one byte and move on, which guarantees the
      // loop terminates and the whole input is consumed.
      $result[] = $replacement;
      $offset += 1;
    }
  }

  return implode('', $result);
}
/**
 * Determine if a string is valid UTF-8.
 *
 * When the mbstring extension is available the check is delegated to
 * mb_check_encoding(). Otherwise a pure-PCRE fallback is used which accepts
 * exactly the well-formed byte sequences of RFC 3629: no overlong encodings,
 * no UTF-16 surrogates (U+D800..U+DFFF) and nothing above U+10FFFF. NUL
 * (U+0000) is a valid character. The empty string is valid.
 *
 * @param string Some string which may or may not be valid UTF-8.
 * @return bool True if the string is valid UTF-8.
 * @group utf8
 */
function phutil_is_utf8($string) {
  if (function_exists('mb_check_encoding')) {
    // If mbstring is available, this is significantly faster than using PHP
    // regexps.
    return mb_check_encoding($string, 'UTF-8');
  }

  // Matches one run of ASCII bytes or one well-formed multibyte sequence,
  // anchored with \G to the point where the previous match ended. The pattern
  // is single-quoted so PCRE (not PHP) interprets the \x escapes.
  $regex =
    '/\G(?:'.
      '[\x00-\x7F]++'.
      '|[\xC2-\xDF][\x80-\xBF]'.
      '|\xE0[\xA0-\xBF][\x80-\xBF]'.
      '|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}'.
      '|\xED[\x80-\x9F][\x80-\xBF]'.
      '|\xF0[\x90-\xBF][\x80-\xBF]{2}'.
      '|[\xF1-\xF3][\x80-\xBF]{3}'.
      '|\xF4[\x80-\x8F][\x80-\xBF]{2}'.
    ')/';

  // Strip well-formed sequences off the front of the string. Because of \G,
  // stripping stops at the first byte that does not start a well-formed
  // sequence, so anything left over begins with invalid data. Matching one
  // sequence at a time (instead of a single "^( ... )*$" match) avoids the
  // heavy backtracking a nested quantifier causes on long inputs.
  $invalid_tail = preg_replace($regex, '', $string);

  // preg_replace() yields null on a PCRE error; treat that as "not valid".
  return ($invalid_tail === '');
}
/**
 * Count the characters (not bytes) in a UTF-8 string.
 *
 * Uses mb_strlen() when the mbstring extension provides it; otherwise the
 * string is split with phutil_utf8v() and the pieces are counted.
 *
 * @param string A valid utf-8 string.
 * @return int The character length of the string.
 * @group utf8
 */
function phutil_utf8_strlen($string) {
  if (!function_exists('mb_strlen')) {
    // TODO: This is terrifically slow.
    $characters = phutil_utf8v($string);
    return count($characters);
  }

  return mb_strlen($string, 'UTF-8');
}
/**
 * Split a UTF-8 string into an array of characters.
 *
 * The split is structural: the lead byte of each character determines how
 * many bytes it occupies, and every following byte must be a continuation
 * byte (0x80..0xBF). An Exception is thrown if a continuation byte appears
 * where a lead byte is expected, if the lead byte is 0xFE or 0xFF, if the
 * string ends in the middle of a sequence, or if a byte inside a sequence is
 * not a continuation byte.
 *
 * NOTE: This function does not deal properly with combining characters.
 *
 * NOTE: Lead bytes for the legacy 5- and 6-byte forms (0xF8..0xFD) are still
 * accepted, and overlong encodings are not detected. Use phutil_is_utf8() if
 * strict validation is required.
 *
 * @param string A valid utf-8 string.
 * @return list A list of characters in the string.
 * @throws Exception If the string is not structurally valid UTF-8.
 * @group utf8
 */
function phutil_utf8v($string) {
  $invalid = "Invalid UTF-8 string passed to phutil_utf8v().";

  $res = array();
  $len = strlen($string);

  $ii = 0;
  while ($ii < $len) {
    // Work with byte values rather than one-character strings so that the
    // range checks below are plain integer comparisons.
    $byte = ord($string[$ii]);

    if ($byte <= 0x7F) {
      // ASCII: a complete character on its own.
      $res[] = $string[$ii];
      $ii += 1;
      continue;
    }

    if ($byte < 0xC0) {
      // A continuation byte can not start a character.
      throw new Exception($invalid);
    } else if ($byte <= 0xDF) {
      $seq_len = 2;
    } else if ($byte <= 0xEF) {
      $seq_len = 3;
    } else if ($byte <= 0xF7) {
      $seq_len = 4;
    } else if ($byte <= 0xFB) {
      $seq_len = 5;
    } else if ($byte <= 0xFD) {
      $seq_len = 6;
    } else {
      throw new Exception($invalid);
    }

    // The string must not end in the middle of the sequence.
    if ($ii + $seq_len > $len) {
      throw new Exception($invalid);
    }

    // Every byte after the lead byte must be a continuation byte. Checking
    // both bounds matters: without the lower bound, a lead byte followed by
    // an ASCII byte would be accepted and the ASCII byte swallowed.
    for ($jj = 1; $jj < $seq_len; ++$jj) {
      $tail = ord($string[$ii + $jj]);
      if ($tail < 0x80 || $tail > 0xBF) {
        throw new Exception($invalid);
      }
    }

    $res[] = substr($string, $ii, $seq_len);
    $ii += $seq_len;
  }

  return $res;
}
/**
 * Shorten a string to provide a summary, respecting UTF-8 characters. This
 * function attempts to truncate strings at word boundaries.
 *
 * If the string already fits within the requested length it is returned
 * unmodified. Otherwise the function searches backward from the cut-off point
 * for a good place to break:
 *
 *   - at a sentence-ending character ('.', '!' or '?'), in which case the
 *     string is cut just after it and the terminal is NOT appended; or
 *   - at a word-break character (space, newline, ';', ':', '[', '(', ',' or
 *     '-'), in which case the string is cut before it and the terminal is
 *     appended; or
 *   - failing both, at the last position which still leaves room for the
 *     terminal.
 *
 * NOTE: This function makes a best effort to apply some reasonable rules but
 * will not work well for the full range of unicode languages. For instance,
 * no effort is made to deal with combining characters.
 *
 * @param string UTF-8 string to shorten.
 * @param int Maximum length of the result.
 * @param string If the string is shortened, add this at the end. Defaults to
 *               horizontal ellipsis.
 * @return string A string with no more than the specified character length.
 * @throws Exception If the terminal is not shorter than the maximum length,
 *                   or if phutil_utf8v() rejects either string.
 * @group utf8
 */
function phutil_utf8_shorten($string, $length, $terminal = "\xE2\x80\xA6") {
  $terminal_len = count(phutil_utf8v($terminal));
  if ($terminal_len >= $length) {
    // If you provide a terminal we still enforce that the result (including
    // the terminal) is no longer than $length, but we can't do that if the
    // terminal is too long.
    throw new Exception(
      "String terminal length must be less than string length!");
  }

  $string_v = phutil_utf8v($string);
  $string_len = count($string_v);

  if ($string_len <= $length) {
    // If the string is already shorter than the requested length, simply
    // return it unmodified.
    return $string;
  }

  // NOTE: This is not complete, and there are many other word boundary
  // characters and reasonable places to break words in the UTF-8 character
  // space. For now, this gives us reasonable behavior for latin languages. We
  // don't necessarily have access to PCRE+Unicode so there isn't a great way
  // for us to look up character attributes.

  // If we encounter these, prefer to break on them instead of cutting the
  // string off in the middle of a word.
  static $break_characters = array(
    ' '   => true,
    "\n"  => true,
    ';'   => true,
    ':'   => true,
    '['   => true,
    '('   => true,
    ','   => true,
    '-'   => true,
  );

  // If we encounter these, shorten to this character exactly without
  // appending the terminal.
  static $stop_characters = array(
    '.'   => true,
    '!'   => true,
    '?'   => true,
  );

  // Search backward in the string, looking for reasonable places to break it.
  $word_boundary = null;
  $stop_boundary = null;

  // If we do a word break with a terminal, we have to look beyond at least
  // the number of characters in the terminal.
  $terminal_area = $length - $terminal_len;

  // Index $length is always in range here because the string is known to be
  // longer than $length.
  for ($ii = $length; $ii >= 0; $ii--) {
    $c = $string_v[$ii];

    if (isset($break_characters[$c]) && ($ii <= $terminal_area)) {
      // Keep moving left through a run of break characters so the cut lands
      // before the first of them.
      $word_boundary = $ii;
    } else if (isset($stop_characters[$c]) && ($ii < $length)) {
      $stop_boundary = $ii + 1;
      break;
    } else {
      // An ordinary character: if a break was already found, the run of
      // break characters has ended and the search is over.
      if ($word_boundary !== null) {
        break;
      }
    }
  }

  if ($stop_boundary !== null) {
    // We found a character like ".". Cut the string there, without appending
    // the terminal.
    $string_part = array_slice($string_v, 0, $stop_boundary);
    return implode('', $string_part);
  }

  // If we didn't find any boundary characters or we found ONLY boundary
  // characters, just break at the maximum character length.
  if ($word_boundary === null || $word_boundary === 0) {
    $word_boundary = $length - $terminal_len;
  }

  $string_part = array_slice($string_v, 0, $word_boundary);
  $string_part = implode('', $string_part);

  return $string_part.$terminal;
}
Event Timeline
Log In to Comment