diff --git a/src/utils/__tests__/PhutilUTF8TestCase.php b/src/utils/__tests__/PhutilUTF8TestCase.php index f7e0656..46db49f 100644 --- a/src/utils/__tests__/PhutilUTF8TestCase.php +++ b/src/utils/__tests__/PhutilUTF8TestCase.php @@ -1,384 +1,434 @@ assertEqual($input, phutil_utf8ize($input)); } public function testUTF8izeUTF8Ignored() { $input = "\xc3\x9c \xc3\xbc \xe6\x9d\xb1!"; $this->assertEqual($input, phutil_utf8ize($input)); } public function testUTF8izeLongStringNosegfault() { // For some reason my laptop is segfaulting on long inputs inside // preg_match(). Forestall this craziness in the common case, at least. phutil_utf8ize(str_repeat('x', 1024 * 1024)); $this->assertEqual(true, true); } public function testUTF8izeInvalidUTF8Fixed() { $input = "\xc3 this has \xe6\x9d some invalid utf8 \xe6"; $expect = "\xEF\xBF\xBD this has \xEF\xBF\xBD\xEF\xBF\xBD some invalid utf8 ". "\xEF\xBF\xBD"; $result = phutil_utf8ize($input); $this->assertEqual($expect, $result); } public function testUTF8izeOwlIsCuteAndFerocious() { // This was once a ferocious owl when we used to use "?" as the replacement // character instead of U+FFFD, but now he is sort of not as cute or // ferocious. $input = "M(o\xEE\xFF\xFFo)M"; $expect = "M(o\xEF\xBF\xBD\xEF\xBF\xBD\xEF\xBF\xBDo)M"; $result = phutil_utf8ize($input); $this->assertEqual($expect, $result); } public function testUTF8len() { $strings = array( '' => 0, 'x' => 1, "\xEF\xBF\xBD" => 1, "x\xe6\x9d\xb1y" => 3, "xyz" => 3, 'quack' => 5, ); foreach ($strings as $str => $expect) { $this->assertEqual($expect, phutil_utf8_strlen($str), 'Length of '.$str); } } public function testUTF8v() { $strings = array( '' => array(), 'x' => array('x'), 'quack' => array('q', 'u', 'a', 'c', 'k'), "x\xe6\x9d\xb1y" => array('x', "\xe6\x9d\xb1", 'y'), // This is a combining character. "x\xCD\xA0y" => array("x", "\xCD\xA0", 'y'), ); foreach ($strings as $str => $expect) { $this->assertEqual($expect, phutil_utf8v($str), 'Vector of '.$str); } } public function testUTF8vCodepoints() { $strings = array( '' => array(), 'x' => array(0x78), 'quack' => array(0x71, 0x75, 0x61, 0x63, 0x6B), "x\xe6\x9d\xb1y" => array(0x78, 0x6771, 0x79), "\xC2\xBB" => array(0x00BB), "\xE2\x98\x83" => array(0x2603), "\xEF\xBF\xBF" => array(0xFFFF), "\xF0\x9F\x92\xA9" => array(0x1F4A9), // This is a combining character. "x\xCD\xA0y" => array(0x78, 0x0360, 0x79), ); foreach ($strings as $str => $expect) { $this->assertEqual( $expect, phutil_utf8v_codepoints($str), 'Codepoint Vector of '.$str); } } public function testUTF8ConsoleStrlen() { $strings = array( "" => 0, "\0" => 0, "x" => 1, // Double-width chinese character. "\xe6\x9d\xb1" => 2, ); foreach ($strings as $str => $expect) { $this->assertEqual( $expect, phutil_utf8_console_strlen($str), 'Console Length of '.$str); } } public function testUTF8shorten() { $inputs = array( array("1erp derp derp", 9, "", "1erp derp"), array("2erp derp derp", 12, "...", "2erp derp..."), array("derpxderpxderp", 12, "...", "derpxderp..."), array("derp\xE2\x99\x83derpderp", 12, "...", "derp\xE2\x99\x83derp..."), array("", 12, "...", ""), array("derp", 12, "...", "derp"), array("11111", 5, "2222", "11111"), array("111111", 5, "2222", "12222"), array("D1rp. Derp derp.", 7, "...", "D1rp."), array("D2rp. Derp derp.", 5, "...", "D2rp."), array("D3rp. Derp derp.", 4, "...", "D..."), array("D4rp. Derp derp.", 14, "...", "D4rp. Derp..."), array("D5rpderp, derp derp", 16, "...", "D5rpderp..."), array("D6rpderp, derp derp", 17, "...", "D6rpderp, derp..."), // This behavior is maybe a little bad, but it seems mostly reasonable, // at least for latin languages. array("Derp, supercalafragalisticexpialadoshus", 30, "...", "Derp..."), // If a string has only word-break characters in it, we should just cut // it, not produce only the terminal. array("((((((((((", 8, '...', '(((((...'), ); foreach ($inputs as $input) { list($string, $length, $terminal, $expect) = $input; $result = phutil_utf8_shorten($string, $length, $terminal); $this->assertEqual($expect, $result, 'Shortening of '.$string); } try { phutil_utf8_shorten('derp', 3, 'quack'); $caught = false; } catch (Exception $ex) { $caught = true; } $this->assertEqual(true, $caught, 'Expect exception for terminal.'); } public function testUTF8Wrap() { $inputs = array( array( 'aaaaaaa', 3, array( 'aaa', 'aaa', 'a', )), array( 'aaaaaaa', 3, array( 'aaa', 'aaa', 'a', )), array( 'aa&aaaa', 3, array( 'aa&', 'aaa', 'a', )), array( "aa\xe6\x9d\xb1aaaa", 3, array( "aa\xe6\x9d\xb1", 'aaa', 'a', )), array( '', 80, array( )), array( 'a', 80, array( 'a', )), ); foreach ($inputs as $input) { list($string, $width, $expect) = $input; $this->assertEqual( $expect, phutil_utf8_hard_wrap_html($string, $width), "Wrapping of '".$string."'"); } } public function testUTF8NonHTMLWrap() { $inputs = array( array( 'aaaaaaa', 3, array( 'aaa', 'aaa', 'a', )), array( 'abracadabra!', 4, array( 'abra', 'cada', 'bra!', )), array( '', 10, array( )), array( 'a', 20, array( 'a', )), array( "aa\xe6\x9d\xb1aaaa", 3, array( "aa\xe6\x9d\xb1", 'aaa', 'a', )), array( "mmm\nmmm\nmmmm", 3, array( 'mmm', 'mmm', 'mmm', 'm', )), ); foreach ($inputs as $input) { list($string, $width, $expect) = $input; $this->assertEqual( $expect, phutil_utf8_hard_wrap($string, $width), "Wrapping of '".$string."'"); } } public function testUTF8ConvertParams() { $caught = null; try { phutil_utf8_convert('', 'utf8', ''); } catch (Exception $ex) { $caught = $ex; } $this->assertEqual(true, (bool)$caught, 'Requires source encoding.'); $caught = null; try { phutil_utf8_convert('', '', 'utf8'); } catch (Exception $ex) { $caught = $ex; } $this->assertEqual(true, (bool)$caught, 'Requires target encoding.'); } public function testUTF8Convert() { if (!function_exists('mb_convert_encoding')) { $this->assertSkipped("Requires mbstring extension."); } // "[ae]gis se[n]or [(c)] 1970 [+/-] 1 [degree]" $input = "\xE6gis SE\xD1OR \xA9 1970 \xB11\xB0"; $expect = "\xC3\xA6gis SE\xC3\x91OR \xC2\xA9 1970 \xC2\xB11\xC2\xB0"; $output = phutil_utf8_convert($input, 'UTF-8', 'ISO-8859-1'); $this->assertEqual($expect, $output, 'Conversion from ISO-8859-1.'); $caught = null; try { phutil_utf8_convert('xyz', 'moon language', 'UTF-8'); } catch (Exception $ex) { $caught = $ex; } $this->assertEqual(true, (bool)$caught, 'Conversion with bogus encoding.'); } public function testUTF8ucwords() { $tests = array( '' => '', 'x' => 'X', 'X' => 'X', 'five short graybles' => 'Five Short Graybles', 'xXxSNiPeRKiLLeRxXx' => 'XXxSNiPeRKiLLeRxXx', ); foreach ($tests as $input => $expect) { $this->assertEqual( $expect, phutil_utf8_ucwords($input), 'phutil_utf8_ucwords("'.$input.'")'); } } public function testUTF8strtolower() { $tests = array( '' => '', 'a' => 'a', 'A' => 'a', '!' => '!', 'OMG!~ LOLolol ROFLwaffle11~' => 'omg!~ lololol roflwaffle11~', "\xE2\x98\x83" => "\xE2\x98\x83", ); foreach ($tests as $input => $expect) { $this->assertEqual( $expect, phutil_utf8_strtolower($input), 'phutil_utf8_strtolower("'.$input.'")'); } } public function testUTF8strtoupper() { $tests = array( '' => '', 'a' => 'A', 'A' => 'A', '!' => '!', 'Cats have 9 lives.' => 'CATS HAVE 9 LIVES.', "\xE2\x98\x83" => "\xE2\x98\x83", ); foreach ($tests as $input => $expect) { $this->assertEqual( $expect, phutil_utf8_strtoupper($input), 'phutil_utf8_strtoupper("'.$input.'")'); } } public function testUTF8IsCombiningCharacter() { $character = "\xCD\xA0"; $this->assertEqual( true, phutil_utf8_is_combining_character($character)); $character = 'a'; $this->assertEqual( false, phutil_utf8_is_combining_character($character)); } + public function testUTF8vCombined() { + // Empty string. + $string = ''; + $this->assertEqual(array(), phutil_utf8v_combined($string)); + + // Single character. + $string = 'x'; + $this->assertEqual(array('x'), phutil_utf8v_combined($string)); + + // No combining characters. + $string = 'cat'; + $this->assertEqual(array('c', 'a', 't'), phutil_utf8v_combined($string)); + + // String with a combining character in the middle. + $string = "ca\xCD\xA0t"; + $this->assertEqual( + array('c', "a\xCD\xA0", 't'), + phutil_utf8v_combined($string)); + + // String starting with a combined character. + $string = "c\xCD\xA0at"; + $this->assertEqual( + array("c\xCD\xA0", 'a', 't'), + phutil_utf8v_combined($string)); + + // String with trailing combining character. + $string = "cat\xCD\xA0"; + $this->assertEqual( + array('c', 'a', "t\xCD\xA0"), + phutil_utf8v_combined($string)); + + // String with muliple combined characters. + $string = "c\xCD\xA0a\xCD\xA0t\xCD\xA0"; + $this->assertEqual( + array("c\xCD\xA0", "a\xCD\xA0", "t\xCD\xA0"), + phutil_utf8v_combined($string)); + + // String with multiple combining characters. + $string = "ca\xCD\xA0\xCD\xA0t"; + $this->assertEqual( + array('c', "a\xCD\xA0\xCD\xA0", 't'), + phutil_utf8v_combined($string)); + + // String beginning with a combining character. + $string = "\xCD\xA0\xCD\xA0c"; + $this->assertEqual( + array(" \xCD\xA0\xCD\xA0", 'c'), + phutil_utf8v_combined($string)); + + } } diff --git a/src/utils/utf8.php b/src/utils/utf8.php index 2d354b9..6b7a327 100644 --- a/src/utils/utf8.php +++ b/src/utils/utf8.php @@ -1,650 +1,691 @@ = 0x1100 && ($c <= 0x115f || /* Hangul Jamo init. consonants */ $c == 0x2329 || $c == 0x232a || ($c >= 0x2e80 && $c <= 0xa4cf && $c != 0x303f) || /* CJK ... Yi */ ($c >= 0xac00 && $c <= 0xd7a3) || /* Hangul Syllables */ ($c >= 0xf900 && $c <= 0xfaff) || /* CJK Compatibility Ideographs */ ($c >= 0xfe10 && $c <= 0xfe19) || /* Vertical forms */ ($c >= 0xfe30 && $c <= 0xfe6f) || /* CJK Compatibility Forms */ ($c >= 0xff00 && $c <= 0xff60) || /* Fullwidth Forms */ ($c >= 0xffe0 && $c <= 0xffe6) || ($c >= 0x20000 && $c <= 0x2fffd) || ($c >= 0x30000 && $c <= 0x3fffd))); } return $len; } /** * Split a UTF-8 string into an array of characters. Combining characters are * also split. * * @param string A valid utf-8 string. * @return list A list of characters in the string. * @group utf8 */ function phutil_utf8v($string) { $res = array(); $len = strlen($string); $ii = 0; while ($ii < $len) { $byte = $string[$ii]; if ($byte <= "\x7F") { $res[] = $byte; $ii += 1; continue; } else if ($byte < "\xC0") { throw new Exception("Invalid UTF-8 string passed to phutil_utf8v()."); } else if ($byte <= "\xDF") { $seq_len = 2; } else if ($byte <= "\xEF") { $seq_len = 3; } else if ($byte <= "\xF7") { $seq_len = 4; } else if ($byte <= "\xFB") { $seq_len = 5; } else if ($byte <= "\xFD") { $seq_len = 6; } else { throw new Exception("Invalid UTF-8 string passed to phutil_utf8v()."); } if ($ii + $seq_len > $len) { throw new Exception("Invalid UTF-8 string passed to phutil_utf8v()."); } for ($jj = 1; $jj < $seq_len; ++$jj) { if ($string[$ii + $jj] >= "\xC0") { throw new Exception("Invalid UTF-8 string passed to phutil_utf8v()."); } } $res[] = substr($string, $ii, $seq_len); $ii += $seq_len; } return $res; } /** * Split a UTF-8 string into an array of codepoints (as integers). * * @param string A valid UTF-8 string. * @return list A list of codepoints, as integers. * @group utf8 */ function phutil_utf8v_codepoints($string) { $str_v = phutil_utf8v($string); foreach ($str_v as $key => $char) { $c = ord($char[0]); $v = 0; if (($c & 0x80) == 0) { $v = $c; } else if (($c & 0xE0) == 0xC0) { $v = (($c & 0x1F) << 6) + ((ord($char[1]) & 0x3F)); } else if (($c & 0xF0) == 0xE0) { $v = (($c & 0x0F) << 12) + ((ord($char[1]) & 0x3f) << 6) + ((ord($char[2]) & 0x3f)); } else if (($c & 0xF8) == 0xF0) { $v = (($c & 0x07) << 18) + ((ord($char[1]) & 0x3F) << 12) + ((ord($char[2]) & 0x3F) << 6) + ((ord($char[3]) & 0x3f)); } else if (($c & 0xFC) == 0xF8) { $v = (($c & 0x03) << 24) + ((ord($char[1]) & 0x3F) << 18) + ((ord($char[2]) & 0x3F) << 12) + ((ord($char[3]) & 0x3f) << 6) + ((ord($char[4]) & 0x3f)); } else if (($c & 0xFE) == 0xFC) { $v = (($c & 0x01) << 30) + ((ord($char[1]) & 0x3F) << 24) + ((ord($char[2]) & 0x3F) << 18) + ((ord($char[3]) & 0x3f) << 12) + ((ord($char[4]) & 0x3f) << 6) + ((ord($char[5]) & 0x3f)); } $str_v[$key] = $v; } return $str_v; } /** * Shorten a string to provide a summary, respecting UTF-8 characters. This * function attempts to truncate strings at word boundaries. * * NOTE: This function makes a best effort to apply some reasonable rules but * will not work well for the full range of unicode languages. For instance, * no effort is made to deal with combining characters. * * @param string UTF-8 string to shorten. * @param int Maximum length of the result. * @param string If the string is shortened, add this at the end. Defaults to * horizontal ellipsis. * @return string A string with no more than the specified character length. * * @group utf8 */ function phutil_utf8_shorten($string, $length, $terminal = "\xE2\x80\xA6") { $terminal_len = phutil_utf8_strlen($terminal); if ($terminal_len >= $length) { // If you provide a terminal we still enforce that the result (including // the terminal) is no longer than $length, but we can't do that if the // terminal is too long. throw new Exception( "String terminal length must be less than string length!"); } $string_v = phutil_utf8v($string); $string_len = count($string_v); if ($string_len <= $length) { // If the string is already shorter than the requested length, simply return // it unmodified. return $string; } // NOTE: This is not complete, and there are many other word boundary // characters and reasonable places to break words in the UTF-8 character // space. For now, this gives us reasonable behavior for latin langauges. We // don't necessarily have access to PCRE+Unicode so there isn't a great way // for us to look up character attributes. // If we encounter these, prefer to break on them instead of cutting the // string off in the middle of a word. static $break_characters = array( ' ' => true, "\n" => true, ';' => true, ':' => true, '[' => true, '(' => true, ',' => true, '-' => true, ); // If we encounter these, shorten to this character exactly without appending // the terminal. static $stop_characters = array( '.' => true, '!' => true, '?' => true, ); // Search backward in the string, looking for reasonable places to break it. $word_boundary = null; $stop_boundary = null; // If we do a word break with a terminal, we have to look beyond at least the // number of characters in the terminal. $terminal_area = $length - $terminal_len; for ($ii = $length; $ii >= 0; $ii--) { $c = $string_v[$ii]; if (isset($break_characters[$c]) && ($ii <= $terminal_area)) { $word_boundary = $ii; } else if (isset($stop_characters[$c]) && ($ii < $length)) { $stop_boundary = $ii + 1; break; } else { if ($word_boundary !== null) { break; } } } if ($stop_boundary !== null) { // We found a character like ".". Cut the string there, without appending // the terminal. $string_part = array_slice($string_v, 0, $stop_boundary); return implode('', $string_part); } // If we didn't find any boundary characters or we found ONLY boundary // characters, just break at the maximum character length. if ($word_boundary === null || $word_boundary === 0) { $word_boundary = $length - $terminal_len; } $string_part = array_slice($string_v, 0, $word_boundary); $string_part = implode('', $string_part); return $string_part.$terminal; } /** * Hard-wrap a block of UTF-8 text with embedded HTML tags and entities. * * @param string An HTML string with tags and entities. * @return list List of hard-wrapped lines. * @group utf8 */ function phutil_utf8_hard_wrap_html($string, $width) { $break_here = array(); // Convert the UTF-8 string into a list of UTF-8 characters. $vector = phutil_utf8v($string); $len = count($vector); $char_pos = 0; for ($ii = 0; $ii < $len; ++$ii) { // An ampersand indicates an HTML entity; consume the whole thing (until // ";") but treat it all as one character. if ($vector[$ii] == '&') { do { ++$ii; } while ($vector[$ii] != ';'); ++$char_pos; // An "<" indicates an HTML tag, consume the whole thing but don't treat // it as a character. } else if ($vector[$ii] == '<') { do { ++$ii; } while ($vector[$ii] != '>'); } else { ++$char_pos; } // Keep track of where we need to break the string later. if ($char_pos == $width) { $break_here[$ii] = true; $char_pos = 0; } } $result = array(); $string = ''; foreach ($vector as $ii => $char) { $string .= $char; if (isset($break_here[$ii])) { $result[] = $string; $string = ''; } } if (strlen($string)) { $result[] = $string; } return $result; } /** * Hard-wrap a block of UTF-8 text with no embedded HTML tags and entitites * * @param string A non HTML string * @param int Width of the hard-wrapped lines * @return list List of hard-wrapped lines. * @group utf8 */ function phutil_utf8_hard_wrap($string, $width) { $result = array(); $lines = phutil_split_lines($string, $retain_endings = false); foreach ($lines as $line) { // Convert the UTF-8 string into a list of UTF-8 characters. $vector = phutil_utf8v($line); $len = count($vector); $buffer = ''; for ($ii = 1; $ii <= $len; ++$ii) { $buffer .= $vector[$ii - 1]; if (($ii % $width) === 0) { $result[] = $buffer; $buffer = ''; } } if (strlen($buffer)) { $result[] = $buffer; } } return $result; } /** * Convert a string from one encoding (like ISO-8859-1) to another encoding * (like UTF-8). * * This is primarily a thin wrapper around `mb_convert_encoding()` which checks * you have the extension installed, since we try to require the extension * only if you actually need it (i.e., you want to work with encodings other * than UTF-8). * * NOTE: This function assumes that the input is in the given source encoding. * If it is not, it may not output in the specified target encoding. If you * need to perform a hard conversion to UTF-8, use this function in conjunction * with @{function:phutil_utf8ize}. We can detect failures caused by invalid * encoding names, but `mb_convert_encoding()` fails silently if the * encoding name identifies a real encoding but the string is not actually * encoded with that encoding. * * @param string String to re-encode. * @param string Target encoding name, like "UTF-8". * @param string Source endocing name, like "ISO-8859-1". * @return string Input string, with converted character encoding. * * @group utf8 * * @phutil-external-symbol function mb_convert_encoding */ function phutil_utf8_convert($string, $to_encoding, $from_encoding) { if (!$from_encoding) { throw new InvalidArgumentException( "Attempting to convert a string encoding, but no source encoding ". "was provided. Explicitly provide the source encoding."); } if (!$to_encoding) { throw new InvalidArgumentException( "Attempting to convert a string encoding, but no target encoding ". "was provided. Explicitly provide the target encoding."); } // Normalize encoding names so we can no-op the very common case of UTF8 // to UTF8 (or any other conversion where both encodings are identical). $to_upper = strtoupper(str_replace('-', '', $to_encoding)); $from_upper = strtoupper(str_replace('-', '', $from_encoding)); if ($from_upper == $to_upper) { return $string; } if (!function_exists('mb_convert_encoding')) { throw new Exception( "Attempting to convert a string encoding from '{$from_encoding}' ". "to '{$to_encoding}', but the 'mbstring' PHP extension is not ". "available. Install mbstring to work with encodings other than ". "UTF-8."); } $result = @mb_convert_encoding($string, $to_encoding, $from_encoding); if ($result === false) { $message = error_get_last(); if ($message) { $message = idx($message, 'message', 'Unknown error.'); } throw new Exception( "String conversion from encoding '{$from_encoding}' to encoding ". "'{$to_encoding}' failed: {$message}"); } return $result; } /** * Convert a string to title case in a UTF8-aware way. This function doesn't * necessarily do a great job, but the builtin implementation of ucwords() can * completely destroy inputs, so it just has to be better than that. Similar to * @{function:ucwords}. * * @param string UTF-8 input string. * @return string Input, in some semblance of title case. * * @group utf8 */ function phutil_utf8_ucwords($str) { // NOTE: mb_convert_case() discards uppercase letters in words when converting // to title case. For example, it will convert "AAA" into "Aaa", which is // undesirable. $v = phutil_utf8v($str); $result = ''; $last = null; $ord_a = ord('a'); $ord_z = ord('z'); foreach ($v as $c) { $convert = false; if ($last === null || $last === ' ') { $o = ord($c[0]); if ($o >= $ord_a && $o <= $ord_z) { $convert = true; } } if ($convert) { $result .= phutil_utf8_strtoupper($c); } else { $result .= $c; } $last = $c; } return $result; } /** * Convert a string to lower case in a UTF8-aware way. Similar to * @{function:strtolower}. * * @param string UTF-8 input string. * @return string Input, in some semblance of lower case. * * @group utf8 * * @phutil-external-symbol function mb_convert_case */ function phutil_utf8_strtolower($str) { if (function_exists('mb_convert_case')) { return mb_convert_case($str, MB_CASE_LOWER, 'UTF-8'); } static $map; if ($map === null) { $map = array_combine( range('A', 'Z'), range('a', 'z')); } return phutil_utf8_strtr($str, $map); } /** * Convert a string to upper case in a UTF8-aware way. Similar to * @{function:strtoupper}. * * @param string UTF-8 input string. * @return string Input, in some semblance of upper case. * * @group utf8 * * @phutil-external-symbol function mb_convert_case */ function phutil_utf8_strtoupper($str) { if (function_exists('mb_convert_case')) { return mb_convert_case($str, MB_CASE_UPPER, 'UTF-8'); } static $map; if ($map === null) { $map = array_combine( range('a', 'z'), range('A', 'Z')); } return phutil_utf8_strtr($str, $map); } /** * Replace characters in a string in a UTF-aware way. Similar to * @{function:strtr}. * * @param string UTF-8 input string. * @param map Map of characters to replace. * @return string Input with translated characters. * * @group utf8 */ function phutil_utf8_strtr($str, array $map) { $v = phutil_utf8v($str); $result = ''; foreach ($v as $c) { if (isset($map[$c])) { $result .= $map[$c]; } else { $result .= $c; } } return $result; } /** * Determine if a given unicode character is a combining character or not. * * @param string A single unicode character. * @return boolean True or false. * * @group utf8 */ function phutil_utf8_is_combining_character($character) { $components = phutil_utf8v_codepoints($character); // Combining Diacritical Marks (0300 - 036F). // Combining Diacritical Marks Supplement (1DC0 - 1DFF). // Combining Diacritical Marks for Symbols (20D0 - 20FF). // Combining Half Marks (FE20 - FE2F). foreach ($components as $codepoint) { if ($codepoint >= 0x0300 && $codepoint <= 0x036F || $codepoint >= 0x1DC0 && $codepoint <= 0x1DFF || $codepoint >= 0x20D0 && $codepoint <= 0x20FF || $codepoint >= 0xFE20 && $codepoint <= 0xFE2F) { return true; } } return false; } + +/** + * Split a UTF-8 string into an array of characters. Combining characters + * are not split. + * + * @param string A valid utf-8 string. + * @return list A list of characters in the string. + * + * @group utf8 + */ + +function phutil_utf8v_combined($string) { + $components = phutil_utf8v($string); + $array_length = count($components); + + // If the first character in the string is a combining character, + // prepend a space to the string. + if ( + $array_length > 0 && + phutil_utf8_is_combining_character($components[0])) { + $string = " ".$string; + $components = phutil_utf8v($string); + $array_length++; + } + + for ($index = 1; $index < $array_length; $index++) { + if (phutil_utf8_is_combining_character($components[$index])) { + $components[$index - 1] = + $components[$index - 1].$components[$index]; + + unset($components[$index]); + $components = array_values($components); + + $index --; + $array_length = count($components); + } + } + + return $components; +} +