diff --git a/src/lexer/PhutilLexer.php b/src/lexer/PhutilLexer.php index 67ce6d9..517598f 100644 --- a/src/lexer/PhutilLexer.php +++ b/src/lexer/PhutilLexer.php @@ -1,326 +1,327 @@ array(...), * 'state1' => array(...), * 'state2' => array(...), * ) * * Lexers start at the state named 'start'. Each state should have a list of * rules which can match in that state. A list of rules looks like this: * * array( * array('\s+', 'space'), * array('\d+', 'digit'), * array('\w+', 'word'), * ) * * The lexer operates by processing each rule in the current state in order. * When one matches, it produces a token. For example, the lexer above would * lex this text: * * 3 asdf * * ...to produce these tokens (assuming the rules are for the 'start' state): * * array('digit', '3', null), * array('space', ' ', null), * array('word', 'asdf', null), * * A rule can also cause a state transition: * * array('zebra', 'animal', 'saw_zebra'), * * This would match the text "zebra", emit a token of type "animal", and change * the parser state to "saw_zebra", causing the lexer to start using the rules * from that state. * * To pop the lexer's state, you can use the special state '!pop'. * * Finally, you can provide additional options in the fourth parameter. * The only option accepted by rule validation is `context`; any other option * key causes an UnexpectedValueException. * * Possible values for `context` are `push` (push the token value onto the * context stack), `pop` (pop the context stack and use it to provide context * for the token), and `discard` (pop the context stack and throw away the * value). 
* * For example, to lex text like this: * * Class::CONSTANT * * You can use a rule set like this: * * 'start' => array( * array('\w+(?=::)', 'class', 'saw_class', array('context' => 'push')), * ), * 'saw_class' => array( * array('::', 'operator'), * array('\w+', 'constant', '!pop', array('context' => 'pop')), * ), * * This would parse the above text into this token stream: * * array('class', 'Class', null), * array('operator', '::', null), * array('constant', 'CONSTANT', 'Class'), * * For a concrete implementation, see @{class:PhutilPHPFragmentLexer}. * * @task lexerimpl Lexer Implementation * @task rule Lexer Rules * @task tokens Lexer Tokens * * @group lexer */ abstract class PhutilLexer { private $processedRules; private $lastState; /* -( Lexer Implementation )----------------------------------------------- */ /** * Return a set of rules for this lexer. See description in * @{class:PhutilLexer}. * * @return dict Lexer rules. * @task lexerimpl */ abstract protected function getRawRules(); /* -( Lexer Rules )-------------------------------------------------------- */ /** * Process, normalize, and validate the raw lexer rules. * * @return dict Map from state name to a list of normalized four-element * rules (regex, token, next state, options). * @throws UnexpectedValueException If the raw rules are malformed. * @task rule */ protected function getRules() { $class = get_class($this); $raw_rules = $this->getRawRules(); if (!is_array($raw_rules)) { $type = gettype($raw_rules); throw new UnexpectedValueException( "Expected {$class}->getRawRules() to return array, got {$type}."); } if (empty($raw_rules['start'])) { throw new UnexpectedValueException( "Expected {$class} rules to define rules for state 'start'."); } $processed_rules = array(); foreach ($raw_rules as $state => $rules) { if (!is_array($rules)) { $type = gettype($rules); throw new UnexpectedValueException( "Expected list of rules for state '{$state}' in {$class}, got ". "{$type}."); } foreach ($rules as $key => $rule) { $n = count($rule); if ($n < 2 || $n > 4) { throw new UnexpectedValueException( "Expected rule '{$key}' in state '{$state}' in {$class} to have ". 
"2-4 elements (regex, token, [next state], [options]), got {$n}."); } $rule = array_values($rule); if (count($rule) == 2) { $rule[] = null; } if (count($rule) == 3) { $rule[] = array(); } foreach ($rule[3] as $option => $value) { switch ($option) { case 'context': if ($value !== 'push' && $value !== 'pop' && $value !== 'discard' && $value !== null) { throw new UnexpectedValueException( "Rule '{$key}' in state '{$state}' in {$class} has unknown ". "context rule '{$value}', expected 'push', 'pop' or ". "'discard'."); } break; default: throw new UnexpectedValueException( "Rule '{$key}' in state '{$state}' in {$class} has unknown ". "option '{$option}'."); } } $flags = 'sS'; // NOTE: The "\G" assertion is an offset-aware version of "^". $rule[0] = '(\\G'.$rule[0].')'.$flags; if (@preg_match($rule[0], '') === false) { $error = error_get_last(); throw new UnexpectedValueException( "Rule '{$key}' in state '{$state}' in {$class} defines an ". "invalid regular expression ('{$rule[0]}'): ". idx($error, 'message')); } $next_state = $rule[2]; if ($next_state !== null && $next_state !== '!pop') { if (empty($raw_rules[$next_state])) { throw new UnexpectedValueException( "Rule '{$key}' in state '{$state}' in {$class} transitions to ". "state '{$next_state}', but there are no rules for that state."); } } $processed_rules[$state][] = $rule; } } return $processed_rules; } /* -( Lexer Tokens )------------------------------------------------------- */ /** * Lex an input string into tokens. * * @param string Input string. * @param string Initial lexer state. * @return list List of lexer tokens, each of the form * `array(type, value, context)`. 
* @throws UnexpectedValueException If no rule matches at the current * position, or a matching rule is invalid at runtime (a zero-length * match with no state transition, or popping an empty context stack * or the last state). * @task tokens */ public function getTokens($input, $initial_state = 'start') { if (empty($this->processedRules)) { $this->processedRules = $this->getRules(); } $rules = $this->processedRules; $this->lastState = null; $position = 0; $length = strlen($input); $tokens = array(); $states = array(); $states[] = 'start'; if ($initial_state != 'start') { $states[] = $initial_state; } $context = array(); while ($position < $length) { $state_rules = idx($rules, end($states), array()); foreach ($state_rules as $rule) { + $matches = null; if (!preg_match($rule[0], $input, $matches, 0, $position)) { continue; } list($regexp, $token_type, $next_state, $options) = $rule; $match_length = strlen($matches[0]); if (!$match_length) { /* A zero-length match consumes no input and emits no token, so it must at least change state. */ if ($next_state === null) { throw new UnexpectedValueException( "Rule '{$regexp}' matched a zero-length token and causes no ". "state transition."); } } else { $position += $match_length; $token = array($token_type, $matches[0]); $copt = idx($options, 'context'); if ($copt == 'push') { $context[] = $matches[0]; $token[] = null; } else if ($copt == 'pop') { if (empty($context)) { throw new UnexpectedValueException( "Rule '{$regexp}' popped empty context!"); } $token[] = array_pop($context); } else if ($copt == 'discard') { if (empty($context)) { throw new UnexpectedValueException( "Rule '{$regexp}' discarded empty context!"); } array_pop($context); $token[] = null; } else { $token[] = null; } $tokens[] = $token; } if ($next_state !== null) { if ($next_state == '!pop') { array_pop($states); if (empty($states)) { throw new UnexpectedValueException( "Rule '{$regexp}' popped off the last state."); } } else { $states[] = $next_state; } } continue 2; } throw new UnexpectedValueException( "No lexer rule matched input at char {$position}."); } $this->lastState = $states; return $tokens; } /** * Merge adjacent tokens of the same type. 
For example, if a comment is * tokenized as <"//", "comment">, this method will merge the two tokens into * a single combined token. Adjacent tokens are merged only when both their * type and their context value match. * * @param list List of tokens, as produced by getTokens(). * @return list List of merged tokens. */ public function mergeTokens(array $tokens) { $last = null; $result = array(); foreach ($tokens as $token) { if ($last === null) { $last = $token; continue; } if (($token[0] == $last[0]) && ($token[2] == $last[2])) { $last[1] .= $token[1]; } else { $result[] = $last; $last = $token; } } if ($last !== null) { $result[] = $last; } return $result; } /** * Get the state stack left by the most recent call to getTokens(). * * @return list|null Stack of state names after the last completed * getTokens() call, or null if none has completed. */ public function getLexerState() { return $this->lastState; } } diff --git a/src/lexer/PhutilSimpleOptionsLexer.php b/src/lexer/PhutilSimpleOptionsLexer.php index 3fb6f34..7ed7f27 100644 --- a/src/lexer/PhutilSimpleOptionsLexer.php +++ b/src/lexer/PhutilSimpleOptionsLexer.php @@ -1,89 +1,91 @@ getTokens($input); foreach ($tokens as $key => $token) { list($type, $value) = $token; switch ($type) { case 'esc': $tokens[$key][0] = 'word'; $tokens[$key][1] = substr($value, 1); break; } } $tokens = $this->mergeTokens($tokens); // Find spaces in between two words and turn them into words. This allows // us to parse unescaped spaces in values correctly. for ($ii = 0; $ii < count($tokens); $ii++) { list($type, $value) = $tokens[$ii]; if ($type != ' ') { continue; } $last = idx($tokens, $ii - 1); if (!$last) { continue; } $next = idx($tokens, $ii + 1); if (!$next) { continue; } if ($last[0] == 'word' && $next[0] == 'word') { $tokens[$ii][0] = 'word'; } } // NOTE: Strip these only after merging tokens, so "a b" merges into two // words, "a" and "b", not a single "ab" word. 
foreach ($tokens as $key => $token) { list($type, $value) = $token; switch ($type) { case "'": case '"': case ' ': unset($tokens[$key]); break; } } return array_values($tokens); } /** * Lexer rules for simple option strings: a 'start' state, plus 'string1' * (entered on a single quote) and 'string2' (entered on a double quote). * * @return dict Lexer rules. */ protected function getRawRules() { return array( 'start' => array( array('\s+', ' '), array("'", "'", 'string1'), array('"', '"', 'string2'), array(',', ','), array('=', '='), array('[^\\s\'"=,]+', 'word'), ), 'string1' => array( array('[^\'\\\\]+', 'word'), array("'", "'", '!pop'), array('\\\\.', 'esc'), + array('\\\\$', '!pop'), /* NOTE(review): with only two elements, '!pop' here is the token TYPE, not a state transition, so a lone trailing backslash is emitted as a token of type '!pop' and the state stack is unchanged. The accompanying test expects such input to parse to array(); confirm this is the intended mechanism. */ ), 'string2' => array( array('[^"\\\\]+', 'word'), array('"', '"', '!pop'), array('\\\\.', 'esc'), + array('\\\\$', '!pop'), /* NOTE(review): same as the 'string1' rule above -- '!pop' is the token type here; confirm. */ ), ); } } diff --git a/src/parser/__tests__/PhutilSimpleOptionsTestCase.php b/src/parser/__tests__/PhutilSimpleOptionsTestCase.php index 3ccab78..67202de 100644 --- a/src/parser/__tests__/PhutilSimpleOptionsTestCase.php +++ b/src/parser/__tests__/PhutilSimpleOptionsTestCase.php @@ -1,128 +1,147 @@ array(), // Basic parsing. 'legs=4' => array('legs' => '4'), 'legs=4,eyes=2' => array('legs' => '4', 'eyes' => '2'), // Repeated keys mean last specification wins. 'legs=4,legs=3' => array('legs' => '3'), // Keys with no value should map to true. 'flag' => array('flag' => true), 'legs=4,flag' => array('legs' => '4', 'flag' => true), // Leading and trailing spaces should be ignored. ' flag ' => array('flag' => true), ' legs = 4 , eyes = 2' => array('legs' => '4', 'eyes' => '2'), // Unescaped spaces inside values are OK. 'legs=a b c d' => array('legs' => 'a b c d'), // Case should be ignored. 'LEGS=4' => array('legs' => '4'), 'legs=4, LEGS=4' => array('legs' => '4'), // Empty values should be absent. 'legs=' => array(), 'legs=4,legs=,eyes=2' => array('eyes' => '2'), // Quoted values should allow parsing comma, equals, etc. 'punctuation=",="' => array('punctuation' => ',='), // Quoted keys can also have that stuff. 
'"backslash\\\\quote\\""=1' => array('backslash\\quote"' => '1'), ' "," = "," , "=" = "=" ' => array(',' => ',', '=' => '='), // Strings like this should not parse as simpleoptions. 'SELECT id, name, size FROM table' => array(), '"a""b"' => array(), '=a' => array(), ',a' => array(), 'a==' => array(), 'a=b=' => array(), ); foreach ($map as $string => $expect) { $parser = new PhutilSimpleOptions(); $this->assertEqual( $expect, $parser->parse($string), "Correct parse of '{$string}'"); } } public function testSimpleOptionsCaseParse() { $map = array( 'legs=4, LEGS=8, LeGs' => array( 'legs' => '4', 'LEGS' => '8', 'LeGs' => true, ), ); foreach ($map as $string => $expect) { $parser = new PhutilSimpleOptions(); $parser->setCaseSensitive(true); $this->assertEqual( $expect, $parser->parse($string), "Correct case-sensitive parse of '{$string}'"); } } + /** + * Unterminated quoted strings, including ones that end in a lone backslash, + * must parse to an empty result. + */ + public function testSimpleOptionsUnterminatedStrings() { + $list = array( + '"', + "'", + 'a="', + "a='", + 'a="\\', + "a='\\", + ); + + foreach ($list as $input) { + $parser = new PhutilSimpleOptions(); + $this->assertEqual( + array(), + $parser->parse($input), + "Correct failing parse of invalid input: {$input}"); + } + } + public function testSimpleOptionsUnparse() { $map = array( '' => array(), 'legs=4' => array('legs' => '4'), 'legs=4, eyes=2' => array('legs' => '4', 'eyes' => '2'), 'eyes=2, legs=4' => array('eyes' => '2', 'legs' => '4'), 'legs=4, head' => array('legs' => '4', 'head' => true), 'eyes=2' => array('legs' => '', 'eyes' => '2'), '"thousands separator"=","' => array('thousands separator' => ','), ); foreach ($map as $expect => $dict) { $parser = new PhutilSimpleOptions(); $this->assertEqual( $expect, $parser->unparse($dict), "Correct unparse of ".print_r($dict, true)); } $bogus = array( array('' => ''), array('' => 'x'), ); foreach ($bogus as $bad_input) { $caught = null; try { $parser = new PhutilSimpleOptions(); $parser->unparse($bad_input); } catch (Exception $ex) { $caught = $ex; } $this->assertEqual( true, 
$caught instanceof Exception, "Correct throw on unparse of bad input."); } $parser = new PhutilSimpleOptions(); $this->assertEqual( 'a="\\}"', $parser->unparse(array('a' => '}'), '}'), "Unparse with extra escape."); } }