#!/usr/bin/env php setTagline('utf8 charset test script'); $args->setSynopsis(<<parseStandardArguments(); $args->parse(array( array( 'name' => 'context', 'short' => 'C', 'param' => 'lines', 'default' => 3, 'help' => 'Show __lines__ lines of context instead of the default 3.', 'conflicts' => array( 'test' => 'with --test, context is not shown.', ), ), array( 'name' => 'test', 'short' => 't', 'help' => 'Print file names containing invalid UTF-8 to stdout.', ), array( 'name' => 'files', 'wildcard' => true, ), )); $is_test = $args->getArg('test'); $context = $args->getArg('context'); $files = $args->getArg('files'); if (empty($files)) { $args->printHelpAndExit(); } if ($is_test) { $err = test($files); } else { $err = show($files, $context); } exit($err); function read($file) { if ($file == '-') { return file_get_contents('php://stdin'); } else { return Filesystem::readFile($file); } } function name($file) { if ($file == '-') { return 'stdin'; } else { return $file; } } function test(array $files) { foreach ($files as $file) { $data = read($file); if (!phutil_is_utf8($data)) { echo name($file)."\n"; } } return 0; } function show(array $files, $context) { foreach ($files as $file) { $data = read($file); $ok = phutil_is_utf8($data); if ($ok) { echo "OKAY"; } else { echo "FAIL"; } echo " ".name($file)."\n"; if (!$ok) { $lines = explode("\n", $data); $len = count($lines); $map = array(); $bad = array(); foreach ($lines as $n => $line) { if (phutil_is_utf8($line)) { continue; } $bad[$n] = true; for ($jj = max(0, $n - $context); $jj < min($len, $n + 1 + $context); $jj++) { $map[$jj] = true; } } $width = strlen(max(array_keys($map))); // Set $last such that we print a newline on the first iteration thorugh // the loop. $last = -2; foreach ($map as $idx => $ignored) { if ($idx != $last + 1) { printf("\n"); } $last = $idx; $line = $lines[$idx]; if (!empty($bad[$idx])) { $line = show_problems($line); } printf(" % {$width}d %s\n", $idx + 1, $line); } echo "\n"; } } } function show_problems($line) { $regex = "/^(". "[\x01-\x7F]+". "|([\xC2-\xDF][\x80-\xBF])". "|([\xE0-\xEF][\x80-\xBF][\x80-\xBF])". "|([\xF0-\xF4][\x80-\xBF][\x80-\xBF][\x80-\xBF]))/"; $out = ''; while (strlen($line)) { $match = null; if (preg_match($regex, $line, $match)) { $out .= $match[1]; $line = substr($line, strlen($match[1])); } else { $chr = sprintf("<0x%0X>", ord($line[0])); $chr = phutil_console_format('##%s##', $chr); $out .= $chr; $line = substr($line, 1); } } return $out; }