diff --git a/resources/test/diverse_symbols.php b/resources/test/diverse_symbols.php index abe9632..5476345 100644 --- a/resources/test/diverse_symbols.php +++ b/resources/test/diverse_symbols.php @@ -1,56 +1,56 @@ assertTrue($caught instanceof InvalidArgumentException); } public function testMFilterWithEmptyValueFiltered() { $a = new MFilterTestHelper('o', 'p', 'q'); $b = new MFilterTestHelper('o', '', 'q'); $c = new MFilterTestHelper('o', 'p', 'q'); $list = array( 'a' => $a, 'b' => $b, 'c' => $c, ); $actual = mfilter($list, 'getI'); $expected = array( 'a' => $a, 'c' => $c, ); $this->assertEqual($expected, $actual); } public function testMFilterWithEmptyValueNegateFiltered() { $a = new MFilterTestHelper('o', 'p', 'q'); $b = new MFilterTestHelper('o', '', 'q'); $c = new MFilterTestHelper('o', 'p', 'q'); $list = array( 'a' => $a, 'b' => $b, 'c' => $c, ); $actual = mfilter($list, 'getI', true); $expected = array( 'b' => $b, ); $this->assertEqual($expected, $actual); } public function testIFilterInvalidIndexThrowException() { $caught = null; try { ifilter(array(), null); } catch (InvalidArgumentException $ex) { $caught = $ex; } $this->assertTrue($caught instanceof InvalidArgumentException); } public function testIFilterWithEmptyValueFiltered() { $list = array( 'a' => array('h' => 'o', 'i' => 'p', 'j' => 'q'), 'b' => array('h' => 'o', 'i' => '', 'j' => 'q'), 'c' => array('h' => 'o', 'i' => 'p', 'j' => 'q'), 'd' => array('h' => 'o', 'i' => 0, 'j' => 'q'), 'e' => array('h' => 'o', 'i' => null, 'j' => 'q'), 'f' => array('h' => 'o', 'i' => false, 'j' => 'q'), ); $actual = ifilter($list, 'i'); $expected = array( 'a' => array('h' => 'o', 'i' => 'p', 'j' => 'q'), 'c' => array('h' => 'o', 'i' => 'p', 'j' => 'q'), ); $this->assertEqual($expected, $actual); } public function testIFilterIndexNotExistsAllFiltered() { $list = array( 'a' => array('h' => 'o', 'i' => 'p', 'j' => 'q'), 'b' => array('h' => 'o', 'i' => '', 'j' => 'q'), ); $actual = ifilter($list, 'NoneExisting'); $expected = array(); $this->assertEqual($expected, $actual); } public function testIFilterWithEmptyValueNegateFiltered() { $list = array( 'a' => array('h' => 'o', 'i' => 'p', 'j' => 'q'), 'b' => array('h' => 'o', 'i' => '', 'j' => 'q'), 'c' => array('h' => 'o', 'i' => 'p', 'j' => 'q'), 'd' => array('h' => 'o', 'i' => 0, 'j' => 'q'), 'e' => array('h' => 'o', 'i' => null, 'j' => 'q'), 'f' => array('h' => 'o', 'i' => false, 'j' => 'q'), ); $actual = ifilter($list, 'i', true); $expected = array( 'b' => array('h' => 'o', 'i' => '', 'j' => 'q'), 'd' => array('h' => 'o', 'i' => 0, 'j' => 'q'), 'e' => array('h' => 'o', 'i' => null, 'j' => 'q'), 'f' => array('h' => 'o', 'i' => false, 'j' => 'q'), ); $this->assertEqual($expected, $actual); } public function testIFilterIndexNotExistsNotFiltered() { $list = array( 'a' => array('h' => 'o', 'i' => 'p', 'j' => 'q'), 'b' => array('h' => 'o', 'i' => '', 'j' => 'q'), ); $actual = ifilter($list, 'NoneExisting', true); $expected = array( 'a' => array('h' => 'o', 'i' => 'p', 'j' => 'q'), 'b' => array('h' => 'o', 'i' => '', 'j' => 'q'), ); $this->assertEqual($expected, $actual); } public function testmergevMergingBasicallyWorksCorrectly() { $this->assertEqual( array(), array_mergev( array( // ))); $this->assertEqual( array(), array_mergev( array( array(), array(), array(), ))); $this->assertEqual( array(1, 2, 3, 4, 5), array_mergev( array( array(1, 2), array(3), array(), array(4, 5), ))); $not_valid = array( 'scalar' => array(1), 'array plus scalar' => array(array(), 1), 'null' => array(null), ); foreach ($not_valid as $key => $invalid_input) { $caught = null; try { array_mergev($invalid_input); } catch (InvalidArgumentException $ex) { $caught = $ex; } $this->assertTrue( ($caught instanceof InvalidArgumentException), pht('%s invalid on %s', 'array_mergev()', $key)); } } public function testNonempty() { $this->assertEqual( 'zebra', nonempty(false, null, 0, '', array(), 'zebra')); $this->assertEqual( null, nonempty()); $this->assertEqual( false, nonempty(null, false)); $this->assertEqual( null, nonempty(false, null)); } protected function tryAssertInstancesOfArray($input) { assert_instances_of($input, 'array'); } protected function tryAssertInstancesOfStdClass($input) { assert_instances_of($input, 'stdClass'); } public function testAssertInstancesOf() { $object = new stdClass(); $inputs = array( 'empty' => array(), 'stdClass' => array($object, $object), __CLASS__ => array($object, $this), 'array' => array(array(), array()), 'integer' => array($object, 1), ); $this->tryTestCases( $inputs, array(true, true, false, false, false), array($this, 'tryAssertInstancesOfStdClass'), 'InvalidArgumentException'); $this->tryTestCases( $inputs, array(true, false, false, true, false), array($this, 'tryAssertInstancesOfArray'), 'InvalidArgumentException'); } - public function testAssertStringLike () { + public function testAssertStringLike() { $this->assertEqual( null, assert_stringlike(null)); $this->assertEqual( null, assert_stringlike('')); $this->assertEqual( null, assert_stringlike('Hello World')); $this->assertEqual( null, assert_stringlike(1)); $this->assertEqual( null, assert_stringlike(9.9999)); $this->assertEqual( null, assert_stringlike(true)); $obj = new Exception('.'); $this->assertEqual( null, assert_stringlike($obj)); $obj = (object)array(); try { assert_stringlike($obj); } catch (InvalidArgumentException $ex) { $caught = $ex; } $this->assertTrue($caught instanceof InvalidArgumentException); $array = array( 'foo' => 'bar', 'bar' => 'foo', ); try { assert_stringlike($array); } catch (InvalidArgumentException $ex) { $caught = $ex; } $this->assertTrue($caught instanceof InvalidArgumentException); $tmp = new TempFile(); $resource = fopen($tmp, 'r'); try { assert_stringlike($resource); } catch (InvalidArgumentException $ex) { $caught = $ex; } fclose($resource); $this->assertTrue($caught instanceof InvalidArgumentException); } public function testCoalesce() { $this->assertEqual( 'zebra', coalesce(null, 'zebra')); $this->assertEqual( null, coalesce()); $this->assertEqual( false, coalesce(false, null)); $this->assertEqual( false, coalesce(null, false)); } public function testHeadLast() { $this->assertEqual( 'a', head(explode('.', 'a.b'))); $this->assertEqual( 'b', last(explode('.', 'a.b'))); } public function testHeadKeyLastKey() { $this->assertEqual( 'a', head_key(array('a' => 0, 'b' => 1))); $this->assertEqual( 'b', last_key(array('a' => 0, 'b' => 1))); $this->assertEqual(null, head_key(array())); $this->assertEqual(null, last_key(array())); } public function testID() { $this->assertEqual(true, id(true)); $this->assertEqual(false, id(false)); } public function testIdx() { $array = array( 'present' => true, 'null' => null, ); $this->assertEqual(true, idx($array, 'present')); $this->assertEqual(true, idx($array, 'present', false)); $this->assertEqual(null, idx($array, 'null')); $this->assertEqual(null, idx($array, 'null', false)); $this->assertEqual(null, idx($array, 'missing')); $this->assertEqual(false, idx($array, 'missing', false)); } public function testSplitLines() { $retain_cases = array( '' => array(''), 'x' => array('x'), "x\n" => array("x\n"), "\n" => array("\n"), "\n\n\n" => array("\n", "\n", "\n"), "\r\n" => array("\r\n"), "x\r\ny\n" => array("x\r\n", "y\n"), "x\ry\nz\r\n" => array("x\ry\n", "z\r\n"), "x\ry\nz\r\n\n" => array("x\ry\n", "z\r\n", "\n"), ); foreach ($retain_cases as $input => $expect) { $this->assertEqual( $expect, phutil_split_lines($input, $retain_endings = true), pht('(Retained) %s', addcslashes($input, "\r\n\\"))); } $discard_cases = array( '' => array(''), 'x' => array('x'), "x\n" => array('x'), "\n" => array(''), "\n\n\n" => array('', '', ''), "\r\n" => array(''), "x\r\ny\n" => array('x', 'y'), "x\ry\nz\r\n" => array("x\ry", 'z'), "x\ry\nz\r\n\n" => array("x\ry", 'z', ''), ); foreach ($discard_cases as $input => $expect) { $this->assertEqual( $expect, phutil_split_lines($input, $retain_endings = false), pht('(Discarded) %s', addcslashes($input, "\r\n\\"))); } } public function testArrayFuse() { $this->assertEqual(array(), array_fuse(array())); $this->assertEqual(array('x' => 'x'), array_fuse(array('x'))); } public function testArrayInterleave() { $this->assertEqual(array(), array_interleave('x', array())); $this->assertEqual(array('y'), array_interleave('x', array('y'))); $this->assertEqual( array('y', 'x', 'z'), array_interleave('x', array('y', 'z'))); $this->assertEqual( array('y', 'x', 'z'), array_interleave( 'x', array( 'kangaroo' => 'y', 'marmoset' => 'z', ))); $obj1 = (object)array(); $obj2 = (object)array(); $this->assertEqual( array($obj1, $obj2, $obj1, $obj2, $obj1), array_interleave( $obj2, array( $obj1, $obj1, $obj1, ))); $implode_tests = array( '' => array(1, 2, 3), 'x' => array(1, 2, 3), 'y' => array(), 'z' => array(1), ); foreach ($implode_tests as $x => $y) { $this->assertEqual( implode('', array_interleave($x, $y)), implode($x, $y)); } } public function testLoggableString() { $this->assertEqual( '', phutil_loggable_string('')); $this->assertEqual( "a\\nb", phutil_loggable_string("a\nb")); $this->assertEqual( "a\\x01b", phutil_loggable_string("a\x01b")); $this->assertEqual( "a\\x1Fb", phutil_loggable_string("a\x1Fb")); } public function testPhutilUnits() { $cases = array( '0 seconds in seconds' => 0, '1 second in seconds' => 1, '2 seconds in seconds' => 2, '100 seconds in seconds' => 100, '2 minutes in seconds' => 120, '1 hour in seconds' => 3600, '1 day in seconds' => 86400, '3 days in seconds' => 259200, ); foreach ($cases as $input => $expect) { $this->assertEqual( $expect, phutil_units($input), 'phutil_units("'.$input.'")'); } $bad_cases = array( 'quack', '3 years in seconds', '1 minute in milliseconds', '1 day in days', '-1 minutes in seconds', '1.5 minutes in seconds', ); foreach ($bad_cases as $input) { $caught = null; try { phutil_units($input); } catch (InvalidArgumentException $ex) { $caught = $ex; } $this->assertTrue( ($caught instanceof InvalidArgumentException), 'phutil_units("'.$input.'")'); } } public function testPhutilJSONDecode() { $valid_cases = array( '{}' => array(), '[]' => array(), '[1, 2]' => array(1, 2), '{"a":"b"}' => array('a' => 'b'), ); foreach ($valid_cases as $input => $expect) { $result = phutil_json_decode($input); $this->assertEqual($expect, $result, 'phutil_json_decode('.$input.')'); } $invalid_cases = array( '', '"a"', '{,}', 'null', '"null"', ); foreach ($invalid_cases as $input) { $caught = null; try { phutil_json_decode($input); } catch (Exception $ex) { $caught = $ex; } $this->assertTrue($caught instanceof PhutilJSONParserException); } } public function testPhutilINIDecode() { // Skip the test if we are using an older version of PHP that doesn't // have the `parse_ini_string` function. try { phutil_ini_decode(''); } catch (PhutilMethodNotImplementedException $ex) { $this->assertSkipped($ex->getMessage()); } $valid_cases = array( '' => array(), 'foo=' => array('foo' => ''), 'foo=bar' => array('foo' => 'bar'), 'foo = bar' => array('foo' => 'bar'), "foo = bar\n" => array('foo' => 'bar'), "foo\nbar = baz" => array('bar' => 'baz'), "[foo]\nbar = baz" => array('foo' => array('bar' => 'baz')), "[foo]\n[bar]\nbaz = foo" => array( 'foo' => array(), 'bar' => array('baz' => 'foo'), ), "[foo]\nbar = baz\n\n[bar]\nbaz = foo" => array( 'foo' => array('bar' => 'baz'), 'bar' => array('baz' => 'foo'), ), "; Comment\n[foo]\nbar = baz" => array('foo' => array('bar' => 'baz')), "# Comment\n[foo]\nbar = baz" => array('foo' => array('bar' => 'baz')), "foo = true\n[bar]\nbaz = false" => array('foo' => true, 'bar' => array('baz' => false)), "foo = 1\nbar = 1.234" => array('foo' => 1, 'bar' => 1.234), 'x = {"foo": "bar"}' => array('x' => '{"foo": "bar"}'), ); foreach ($valid_cases as $input => $expect) { $result = phutil_ini_decode($input); $this->assertEqual($expect, $result, 'phutil_ini_decode('.$input.')'); } $invalid_cases = array( '[' => 'syntax error, unexpected $end, expecting \']\' in Unknown on line 1', ); foreach ($invalid_cases as $input => $expect) { $caught = null; try { phutil_ini_decode($input); } catch (Exception $ex) { $caught = $ex; } $this->assertTrue($caught instanceof PhutilINIParserException); $this->assertEqual($expect, $caught->getMessage()); } } public function testCensorCredentials() { $cases = array( '' => '', 'abc' => 'abc', // NOTE: We're liberal about censoring here, since we can't tell // if this is a truncated password at the end of an input string // or a domain name. The version with a "/" isn't censored. 'http://example.com' => 'http://xxxxx', 'http://example.com/' => 'http://example.com/', 'http://username@example.com' => 'http://xxxxx@example.com', 'http://user:pass@example.com' => 'http://xxxxx@example.com', // We censor these because they might be truncated credentials at the end // of the string. 'http://user' => 'http://xxxxx', "http://user\n" => "http://xxxxx\n", 'svn+ssh://user:pass@example.com' => 'svn+ssh://xxxxx@example.com', ); foreach ($cases as $input => $expect) { $this->assertEqual( $expect, phutil_censor_credentials($input), pht('Credential censoring for: %s', $input)); } } public function testVarExport() { // Constants $this->assertEqual('null', phutil_var_export(null)); $this->assertEqual('true', phutil_var_export(true)); $this->assertEqual('false', phutil_var_export(false)); $this->assertEqual("'quack'", phutil_var_export('quack')); $this->assertEqual('1234567', phutil_var_export(1234567)); // Arrays $this->assertEqual( 'array()', phutil_var_export(array())); $this->assertEqual( implode("\n", array( 'array(', ' 1,', ' 2,', ' 3,', ')', )), phutil_var_export(array(1, 2, 3))); $this->assertEqual( implode("\n", array( 'array(', " 'foo' => 'bar',", " 'bar' => 'baz',", ')', )), phutil_var_export(array('foo' => 'bar', 'bar' => 'baz'))); $this->assertEqual( implode("\n", array( 'array(', " 'foo' => array(", " 'bar' => array(", " 'baz' => array(),", ' ),', ' ),', ')', )), phutil_var_export( array('foo' => array('bar' => array('baz' => array()))))); // Objects $this->assertEqual( "stdClass::__set_state(array(\n))", phutil_var_export(new stdClass())); $this->assertEqual( "PhutilTestPhobject::__set_state(array(\n))", phutil_var_export(new PhutilTestPhobject())); } public function testFnmatch() { $cases = array( '' => array( array(''), array('.', '/'), ), '*' => array( array('file'), array('dir/', '/dir'), ), '**' => array( array('file', 'dir/', '/dir', 'dir/subdir/file'), array(), ), '**/file' => array( array('file', 'dir/file', 'dir/subdir/file', 'dir/subdir/subdir/file'), array('file/', 'file/dir'), ), 'file.*' => array( array('file.php', 'file.a', 'file.'), array('files.php', 'file.php/blah'), ), 'fo?' => array( array('foo', 'fot'), array('fooo', 'ffoo', 'fo/', 'foo/'), ), 'fo{o,t}' => array( array('foo', 'fot'), array('fob', 'fo/', 'foo/'), ), 'fo{o,\\,}' => array( array('foo', 'fo,'), array('foo/', 'fo,/'), ), 'fo{o,\\\\}' => array( array('foo', 'fo\\'), array('foo/', 'fo\\/'), ), '/foo' => array( array('/foo'), array('foo', '/foo/'), ), // Tests for various `fnmatch` flags. '*.txt' => array( array( 'file.txt', // FNM_PERIOD '.secret-file.txt', ), array( // FNM_PATHNAME 'dir/file.txt', // FNM_CASEFOLD 'file.TXT', ), '\\*.txt' => array( array( // FNM_NOESCAPE '*.txt', ), array( 'file.txt', ), ), ), ); $invalid = array( '{', 'asdf\\', ); foreach ($cases as $input => $expect) { list($matches, $no_matches) = $expect; foreach ($matches as $match) { $this->assertTrue( phutil_fnmatch($input, $match), pht('Expecting "%s" to match "%s".', $input, $match)); } foreach ($no_matches as $no_match) { $this->assertFalse( phutil_fnmatch($input, $no_match), pht('Expecting "%s" not to match "%s".', $input, $no_match)); } } foreach ($invalid as $input) { $caught = null; try { phutil_fnmatch($input, ''); } catch (Exception $ex) { $caught = $ex; } $this->assertTrue($caught instanceof InvalidArgumentException); } } public function testJSONEncode() { $in = array( 'example' => "Not Valid UTF8: \x80", ); $caught = null; try { $value = phutil_json_encode($in); } catch (Exception $ex) { $caught = $ex; } $this->assertTrue(($caught instanceof Exception)); } } diff --git a/src/utils/utf8.php b/src/utils/utf8.php index 4114208..bd5d629 100644 --- a/src/utils/utf8.php +++ b/src/utils/utf8.php @@ -1,747 +1,747 @@ for some discussion. Since the // input limit is extremely low (less than 50KB on my system), do this check // very very slowly in PHP instead. See also T5316. $len = strlen($string); for ($ii = 0; $ii < $len; $ii++) { $chr = ord($string[$ii]); if ($chr >= 0x01 && $chr <= 0x7F) { continue; } else if ($chr >= 0xC2 && $chr <= 0xDF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } return false; } else if ($chr > 0xE0 && $chr <= 0xEF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } } return false; } else if ($chr == 0xE0) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); // NOTE: This range starts at 0xA0, not 0x80. The values 0x80-0xA0 are // "valid", but not minimal representations, and MySQL rejects them. We're // special casing this part of the range. if ($chr >= 0xA0 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } } return false; } else if (!$only_bmp) { if ($chr > 0xF0 && $chr <= 0xF4) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } } } } else if ($chr == 0xF0) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); // NOTE: As above, this range starts at 0x90, not 0x80. The values // 0x80-0x90 are not minimal representations. if ($chr >= 0x90 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { ++$ii; if ($ii >= $len) { return false; } $chr = ord($string[$ii]); if ($chr >= 0x80 && $chr <= 0xBF) { continue; } } } } } return false; } return true; } /** * Find the character length of a UTF-8 string. * * @param string A valid utf-8 string. * @return int The character length of the string. */ function phutil_utf8_strlen($string) { return strlen(utf8_decode($string)); } /** * Find the console display length of a UTF-8 string. This may differ from the * character length of the string if it contains double-width characters, like * many Chinese characters. * * This method is based on a C implementation here, which is based on the IEEE * standards. The source has more discussion and addresses more considerations * than this implementation does. * * http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c * * NOTE: We currently assume width 1 for East-Asian ambiguous characters. * * NOTE: This function is VERY slow. * * @param string A valid UTF-8 string. * @return int The console display length of the string. */ function phutil_utf8_console_strlen($string) { // Formatting and colors don't contribute any width in the console. $string = preg_replace("/\x1B\[\d*m/", '', $string); // In the common case of an ASCII string, just return the string length. if (preg_match('/^[\x01-\x7F]*\z/', $string)) { return strlen($string); } $len = 0; // NOTE: To deal with combining characters, we're splitting the string into // glyphs first (characters with combiners) and then counting just the width // of the first character in each glyph. $display_glyphs = phutil_utf8v_combined($string); foreach ($display_glyphs as $display_glyph) { $glyph_codepoints = phutil_utf8v_codepoints($display_glyph); foreach ($glyph_codepoints as $c) { if ($c == 0) { break; } $len += 1 + ($c >= 0x1100 && ($c <= 0x115f || /* Hangul Jamo init. consonants */ $c == 0x2329 || $c == 0x232a || ($c >= 0x2e80 && $c <= 0xa4cf && $c != 0x303f) || /* CJK ... Yi */ ($c >= 0xac00 && $c <= 0xd7a3) || /* Hangul Syllables */ ($c >= 0xf900 && $c <= 0xfaff) || /* CJK Compatibility Ideographs */ ($c >= 0xfe10 && $c <= 0xfe19) || /* Vertical forms */ ($c >= 0xfe30 && $c <= 0xfe6f) || /* CJK Compatibility Forms */ ($c >= 0xff00 && $c <= 0xff60) || /* Fullwidth Forms */ ($c >= 0xffe0 && $c <= 0xffe6) || ($c >= 0x20000 && $c <= 0x2fffd) || ($c >= 0x30000 && $c <= 0x3fffd))); break; } } return $len; } /** * Split a UTF-8 string into an array of characters. Combining characters are * also split. * * @param string A valid utf-8 string. * @return list A list of characters in the string. */ function phutil_utf8v($string) { $res = array(); $len = strlen($string); $ii = 0; while ($ii < $len) { $byte = $string[$ii]; if ($byte <= "\x7F") { $res[] = $byte; $ii += 1; continue; } else if ($byte < "\xC0") { throw new Exception( pht('Invalid UTF-8 string passed to %s.', __FUNCTION__)); } else if ($byte <= "\xDF") { $seq_len = 2; } else if ($byte <= "\xEF") { $seq_len = 3; } else if ($byte <= "\xF7") { $seq_len = 4; } else if ($byte <= "\xFB") { $seq_len = 5; } else if ($byte <= "\xFD") { $seq_len = 6; } else { throw new Exception( pht('Invalid UTF-8 string passed to %s.', __FUNCTION__)); } if ($ii + $seq_len > $len) { throw new Exception( pht('Invalid UTF-8 string passed to %s.', __FUNCTION__)); } for ($jj = 1; $jj < $seq_len; ++$jj) { if ($string[$ii + $jj] >= "\xC0") { throw new Exception( pht('Invalid UTF-8 string passed to %s.', __FUNCTION__)); } } $res[] = substr($string, $ii, $seq_len); $ii += $seq_len; } return $res; } /** * Split a UTF-8 string into an array of codepoints (as integers). * * @param string A valid UTF-8 string. * @return list A list of codepoints, as integers. */ function phutil_utf8v_codepoints($string) { $str_v = phutil_utf8v($string); foreach ($str_v as $key => $char) { $c = ord($char[0]); $v = 0; if (($c & 0x80) == 0) { $v = $c; } else if (($c & 0xE0) == 0xC0) { $v = (($c & 0x1F) << 6) + ((ord($char[1]) & 0x3F)); } else if (($c & 0xF0) == 0xE0) { $v = (($c & 0x0F) << 12) + ((ord($char[1]) & 0x3f) << 6) + ((ord($char[2]) & 0x3f)); } else if (($c & 0xF8) == 0xF0) { $v = (($c & 0x07) << 18) + ((ord($char[1]) & 0x3F) << 12) + ((ord($char[2]) & 0x3F) << 6) + ((ord($char[3]) & 0x3f)); } else if (($c & 0xFC) == 0xF8) { $v = (($c & 0x03) << 24) + ((ord($char[1]) & 0x3F) << 18) + ((ord($char[2]) & 0x3F) << 12) + ((ord($char[3]) & 0x3f) << 6) + ((ord($char[4]) & 0x3f)); } else if (($c & 0xFE) == 0xFC) { $v = (($c & 0x01) << 30) + ((ord($char[1]) & 0x3F) << 24) + ((ord($char[2]) & 0x3F) << 18) + ((ord($char[3]) & 0x3f) << 12) + ((ord($char[4]) & 0x3f) << 6) + ((ord($char[5]) & 0x3f)); } $str_v[$key] = $v; } return $str_v; } /** * Hard-wrap a block of UTF-8 text with embedded HTML tags and entities. * * @param string An HTML string with tags and entities. * @return list List of hard-wrapped lines. */ function phutil_utf8_hard_wrap_html($string, $width) { $break_here = array(); // Convert the UTF-8 string into a list of UTF-8 characters. $vector = phutil_utf8v($string); $len = count($vector); $char_pos = 0; for ($ii = 0; $ii < $len; ++$ii) { // An ampersand indicates an HTML entity; consume the whole thing (until // ";") but treat it all as one character. if ($vector[$ii] == '&') { do { ++$ii; } while ($vector[$ii] != ';'); ++$char_pos; // An "<" indicates an HTML tag, consume the whole thing but don't treat // it as a character. } else if ($vector[$ii] == '<') { do { ++$ii; } while ($vector[$ii] != '>'); } else { ++$char_pos; } // Keep track of where we need to break the string later. if ($char_pos == $width) { $break_here[$ii] = true; $char_pos = 0; } } $result = array(); $string = ''; foreach ($vector as $ii => $char) { $string .= $char; if (isset($break_here[$ii])) { $result[] = $string; $string = ''; } } if (strlen($string)) { $result[] = $string; } return $result; } /** * Hard-wrap a block of UTF-8 text with no embedded HTML tags and entities. * * @param string A non HTML string * @param int Width of the hard-wrapped lines * @return list List of hard-wrapped lines. */ function phutil_utf8_hard_wrap($string, $width) { $result = array(); $lines = phutil_split_lines($string, $retain_endings = false); foreach ($lines as $line) { // Convert the UTF-8 string into a list of UTF-8 characters. $vector = phutil_utf8v($line); $len = count($vector); $buffer = ''; for ($ii = 1; $ii <= $len; ++$ii) { $buffer .= $vector[$ii - 1]; if (($ii % $width) === 0) { $result[] = $buffer; $buffer = ''; } } if (strlen($buffer)) { $result[] = $buffer; } } return $result; } /** * Convert a string from one encoding (like ISO-8859-1) to another encoding * (like UTF-8). * * This is primarily a thin wrapper around `mb_convert_encoding()` which checks * you have the extension installed, since we try to require the extension * only if you actually need it (i.e., you want to work with encodings other * than UTF-8). * * NOTE: This function assumes that the input is in the given source encoding. * If it is not, it may not output in the specified target encoding. If you * need to perform a hard conversion to UTF-8, use this function in conjunction * with @{function:phutil_utf8ize}. We can detect failures caused by invalid * encoding names, but `mb_convert_encoding()` fails silently if the * encoding name identifies a real encoding but the string is not actually * encoded with that encoding. * * @param string String to re-encode. * @param string Target encoding name, like "UTF-8". * @param string Source encoding name, like "ISO-8859-1". * @return string Input string, with converted character encoding. * * @phutil-external-symbol function mb_convert_encoding */ function phutil_utf8_convert($string, $to_encoding, $from_encoding) { if (!$from_encoding) { throw new InvalidArgumentException( pht( 'Attempting to convert a string encoding, but no source encoding '. 'was provided. Explicitly provide the source encoding.')); } if (!$to_encoding) { throw new InvalidArgumentException( pht( 'Attempting to convert a string encoding, but no target encoding '. 'was provided. Explicitly provide the target encoding.')); } // Normalize encoding names so we can no-op the very common case of UTF8 // to UTF8 (or any other conversion where both encodings are identical). $to_upper = strtoupper(str_replace('-', '', $to_encoding)); $from_upper = strtoupper(str_replace('-', '', $from_encoding)); if ($from_upper == $to_upper) { return $string; } if (!function_exists('mb_convert_encoding')) { throw new Exception( pht( "Attempting to convert a string encoding from '%s' to '%s', ". "but the '%s' PHP extension is not available. Install %s to ". "work with encodings other than UTF-8.", $from_encoding, $to_encoding, 'mbstring', 'mbstring')); } $result = @mb_convert_encoding($string, $to_encoding, $from_encoding); if ($result === false) { $message = error_get_last(); if ($message) { $message = idx($message, 'message', pht('Unknown error.')); } throw new Exception( pht( "String conversion from encoding '%s' to encoding '%s' failed: %s", $from_encoding, $to_encoding, $message)); } return $result; } /** * Convert a string to title case in a UTF8-aware way. This function doesn't * necessarily do a great job, but the builtin implementation of `ucwords()` can * completely destroy inputs, so it just has to be better than that. Similar to * @{function:ucwords}. * * @param string UTF-8 input string. * @return string Input, in some semblance of title case. */ function phutil_utf8_ucwords($str) { // NOTE: mb_convert_case() discards uppercase letters in words when converting // to title case. For example, it will convert "AAA" into "Aaa", which is // undesirable. $v = phutil_utf8v($str); $result = ''; $last = null; $ord_a = ord('a'); $ord_z = ord('z'); foreach ($v as $c) { $convert = false; if ($last === null || $last === ' ') { $o = ord($c[0]); if ($o >= $ord_a && $o <= $ord_z) { $convert = true; } } if ($convert) { $result .= phutil_utf8_strtoupper($c); } else { $result .= $c; } $last = $c; } return $result; } /** * Convert a string to lower case in a UTF8-aware way. Similar to * @{function:strtolower}. * * @param string UTF-8 input string. * @return string Input, in some semblance of lower case. * * @phutil-external-symbol function mb_convert_case */ function phutil_utf8_strtolower($str) { if (function_exists('mb_convert_case')) { return mb_convert_case($str, MB_CASE_LOWER, 'UTF-8'); } static $map; if ($map === null) { $map = array_combine( range('A', 'Z'), range('a', 'z')); } return phutil_utf8_strtr($str, $map); } /** * Convert a string to upper case in a UTF8-aware way. Similar to * @{function:strtoupper}. * * @param string UTF-8 input string. * @return string Input, in some semblance of upper case. * * @phutil-external-symbol function mb_convert_case */ function phutil_utf8_strtoupper($str) { if (function_exists('mb_convert_case')) { return mb_convert_case($str, MB_CASE_UPPER, 'UTF-8'); } static $map; if ($map === null) { $map = array_combine( range('a', 'z'), range('A', 'Z')); } return phutil_utf8_strtr($str, $map); } /** * Replace characters in a string in a UTF-aware way. Similar to * @{function:strtr}. * * @param string UTF-8 input string. * @param map Map of characters to replace. * @return string Input with translated characters. */ function phutil_utf8_strtr($str, array $map) { $v = phutil_utf8v($str); $result = ''; foreach ($v as $c) { if (isset($map[$c])) { $result .= $map[$c]; } else { $result .= $c; } } return $result; } /** * Determine if a given unicode character is a combining character or not. * * @param string A single unicode character. * @return boolean True or false. */ function phutil_utf8_is_combining_character($character) { $components = phutil_utf8v_codepoints($character); // Combining Diacritical Marks (0300 - 036F). // Combining Diacritical Marks Supplement (1DC0 - 1DFF). // Combining Diacritical Marks for Symbols (20D0 - 20FF). // Combining Half Marks (FE20 - FE2F). foreach ($components as $codepoint) { if ($codepoint >= 0x0300 && $codepoint <= 0x036F || $codepoint >= 0x1DC0 && $codepoint <= 0x1DFF || $codepoint >= 0x20D0 && $codepoint <= 0x20FF || $codepoint >= 0xFE20 && $codepoint <= 0xFE2F) { return true; } } return false; } /** * Split a UTF-8 string into an array of characters. Combining characters * are not split. * * @param string A valid utf-8 string. * @return list A list of characters in the string. */ function phutil_utf8v_combined($string) { $components = phutil_utf8v($string); $array_length = count($components); // If the first character in the string is a combining character, // prepend a space to the string. if ( $array_length > 0 && phutil_utf8_is_combining_character($components[0])) { $string = ' '.$string; $components = phutil_utf8v($string); $array_length++; } for ($index = 1; $index < $array_length; $index++) { if (phutil_utf8_is_combining_character($components[$index])) { $components[$index - 1] = $components[$index - 1].$components[$index]; unset($components[$index]); $components = array_values($components); - $index --; + $index--; $array_length = count($components); } } return $components; }