diff --git a/src/parser/PhutilParserGenerator.php b/src/parser/PhutilParserGenerator.php index 7c04281..c3a28b7 100644 --- a/src/parser/PhutilParserGenerator.php +++ b/src/parser/PhutilParserGenerator.php @@ -1,883 +1,884 @@ setTerminals(array('a', 'b')) * ->setStartRule('S') * ->setRules( * array( * 'S' => 'A b', * 'A' => array( * 'A a', * 'a', * ))) * ->processGrammar(); * * To actually parse token streams, use @{method:parseTokens}. * * $tokens = get_tokens(); // Usually from PhutilLexer * $callback = 'some_callback'; * $tree = $parser->parseTokens($tokens, $callback); * * The callback is invoked when a grammar rule matches. It should have this * signature: * * function parser_callback($rule, $production, array $tokens) { * // ... * } * * The `$rule` is the matching rule; the `$production` is the matching * production, and `$tokens` is the matching tokens (for terminal rules) or the * return value of previous parse callbacks (for nonterminal rules). * * You should either return a result of evaluation, or some sort of abstract * representation of the parse tree (this is more likely to be useful for more * complex grammars). * * NOTE: This class generates LR(1) parsers, which perform less-than-optimally * on large grammars. Worse, it is written in PHP. It is suitable only for * very simple grammars with few states. * * NOTE: These parsers silently resolve reduce/reduce conflicts by choosing the * first reduction, and silently resolve shift/reduce conflicts by shifting. * These are the same rules used by Yacc, but are implicit. * * @task rules Grammar Rules * @task rvalidation Rule Validation * @task first Computing First() * @task tables Computing Action and Goto Tables * @task inspect Inspecting Generator State */ final class PhutilParserGenerator { private $terminals; private $rules; private $startRule = 'start'; private $sets = array(); private $successor = array(); private $setHashes = array(); private $actionTable; private $gotoTable; private $rulesValidated = false; private $eofSymbol; private $initSymbol; private $epsilonSymbol; private $endSymbol; private $firstTable; public function processGrammar() { $this->validateRules(); $this->buildFirstTable(); $init = $this->getInitSymbol(); $eof = $this->getEOFSymbol(); $end = $this->getEndSymbol(); $this->rules[$init] = array( array($this->startRule, $end), ); list($is_new, $state) = $this->addState( array( array($this->getInitSymbol(), 0, 0, $eof), )); $this->buildSuccessors($state); $this->buildTables(); return $this; } /* -( Grammar Rules )------------------------------------------------------ */ public function setTerminals(array $terminals) { $this->terminals = array_fill_keys($terminals, true); return $this; } public function setRules(array $rules) { $this->rules = $rules; return $this; } public function setStartRule($rule_name) { $this->startRule = $rule_name; return $this; } public function getStartRule() { return $this->startRule; } public function getEOFSymbol() { if ($this->eofSymbol === null) { throw new Exception('Call processGrammar() before getEOFSymbol()!'); } return $this->eofSymbol; } public function getInitSymbol() { if ($this->initSymbol === null) { throw new Exception('Call processGrammar() before getInitSymbol()!'); } return $this->initSymbol; } public function getEpsilonSymbol() { if ($this->epsilonSymbol === null) { throw new Exception('Call processGrammar() before getEpsilonSymbol()!'); } return $this->epsilonSymbol; } public function getEndSymbol() { if ($this->endSymbol === null) { throw new Exception('Call processGrammar() before getEndSymbol()!'); } return $this->endSymbol; } public function isTerminal($symbol) { return isset($this->terminals[$symbol]); } public function isRule($symbol) { return isset($this->rules[$symbol]); } /* -( Rule Validation )---------------------------------------------------- */ /** * Perform a battery of tests on the provided rules to detect problems which * would prevent us from generating a parser. * * @return void * @task rvalidation */ private function validateRules() { // Rules must be specified in the right format. $this->parseRules(); // Rules must contain only known symbols. $this->validateRuleSymbols(); // The start rule must exist and be valid. $this->validateStartRule(); // Now, we select printable names for special symbols (EOF, epsilon, etc) // that don't conflict with any symbols in the grammar. $this->chooseSpecialSymbols(); // Make sure every terminal can be reached by some rule. $this->validateAllTerminalsReachable(); // Make sure every rule can be reached. $this->validateAllRulesReachable(); // Make sure every rule has some valid reduction. $this->validateAllRulesReducible(); $this->rulesValidated = true; } /** * @task rvalidation */ private function parseRules() { foreach ($this->rules as $rule_name => $rule_variants) { if (!is_array($rule_variants)) { $rule_variants = array($rule_variants); $this->rules[$rule_name] = $rule_variants; } foreach ($rule_variants as $vkey => $variant) { if ($variant === null) { $variant = array(null); } else if (!is_array($variant)) { $variant = preg_split('/\s+/', $variant); } else { foreach ($variant as $symbol) { if (($symbol === null) && count($variant) > 1) { throw new PhutilInvalidRuleParserGeneratorException( "Rule '{$rule_name}' contains a production '{$vkey}' which ". "is nonempty but has a null in it. A rule with other symbols ". "may not contain null."); } } } $this->rules[$rule_name][$vkey] = array_values($variant); } } } /** * @task rvalidation */ private function validateRuleSymbols() { foreach ($this->rules as $rule => $productions) { foreach ($productions as $production_name => $production) { foreach ($production as $symbol) { if ($symbol === null) { continue; } if ($this->isTerminal($symbol)) { continue; } if ($this->isRule($symbol)) { continue; } $production_string = implode(' ', $production); throw new PhutilUnknownSymbolParserGeneratorException( "Symbol '{$symbol}' in production '{$production_name}' ". "('{$production_string}') of rule '{$rule}' does not name a rule ". "or terminal. Did you misspell a symbol, fail to specify a ". "terminal, or forget a rule?"); } } } } /** * @task rvalidation */ private function validateStartRule() { $start_rule = $this->getStartRule(); if (!$this->isRule($start_rule)) { throw new PhutilUnknownSymbolParserGeneratorException( "Start rule '{$start_rule}' does not appear in the rules for the ". "grammar. Use setStartRule() to choose a different start rule, or ". "add a rule named '{$start_rule}'."); } } /** * @task rvalidation */ private function chooseSpecialSymbols() { $special = array( 'eofSymbol' => '(end-of-file)', 'epsilonSymbol' => '(epsilon)', 'initSymbol' => '(init)', 'endSymbol' => '(end)', ); foreach ($special as $key => $value) { while ($this->isRule($value) || $this->isTerminal($value)) { $value .= "'"; } $special[$key] = $value; } $this->eofSymbol = $special['eofSymbol']; $this->epsilonSymbol = $special['epsilonSymbol']; $this->initSymbol = $special['initSymbol']; $this->endSymbol = $special['endSymbol']; foreach ($this->rules as $rule => $productions) { foreach ($productions as $production_name => $production) { foreach ($production as $key => $symbol) { if ($symbol === null) { $this->rules[$rule][$production_name][$key] = $this->epsilonSymbol; } } $this->rules[$rule][$production_name][] = $this->endSymbol; } } $this->terminals[$this->getEOFSymbol()] = true; } /** * @task rvalidation */ private function validateAllTerminalsReachable() { $seen = array(); foreach ($this->rules as $rule => $productions) { foreach ($productions as $production) { foreach ($production as $symbol) { $seen[$symbol] = true; } } } $missing = array_diff_key($this->terminals, $seen); unset($missing[$this->getEOFSymbol()]); if ($missing) { $missing_terminals = array_keys($missing); $missing_terminals = implode(', ', $missing_terminals); throw new PhutilUnreachableTerminalParserGeneratorException( 'Some terminals do not appear in any rule: '. $missing_terminals); } } /** * @task rvalidation */ private function validateAllRulesReachable() { $stack = array(); $reachable = $this->computeReachableRules($this->getStartRule(), $stack); $missing = array_diff_key($this->rules, $reachable); unset($missing[$this->getStartRule()]); if ($missing) { $missing_rules = array_keys($missing); $missing_rules = implode(', ', $missing_rules); throw new PhutilUnreachableRuleParserGeneratorException( 'Some rules can never be reached from any production: '. $missing_rules); } } /** * @task rvalidation */ private function computeReachableRules($rule, array &$stack) { if (isset($stack[$rule])) { return $stack[$rule]; } $stack[$rule] = array(); foreach ($this->rules[$rule] as $production) { foreach ($production as $symbol) { if ($this->isRule($symbol)) { $stack[$rule][$symbol] = true; $stack[$rule] += $this->computeReachableRules($symbol, $stack); } } } return $stack[$rule]; } /** * @task rvalidation */ private function validateAllRulesReducible() { $reducible = array(); foreach ($this->rules as $rule => $productions) { if (!$this->isRuleReducible($rule, $reducible)) { throw new PhutilIrreducibleRuleParserGeneratorException( "Rule '{$rule}' can never be reduced: it recurses indefinitely ". "and reaches no production of terminals."); } } } /** * @task rvalidation */ private function isRuleReducible($rule, array &$reducible) { if (isset($reducible[$rule])) { return $reducible[$rule]; } // Set this ahead of time so we don't end up in an infinite loop if // rules recurse. We'll overwrite it if we find a reduction. $reducible[$rule] = false; $reducible[$rule] = $this->computeRuleReducible($rule, $reducible); return $reducible[$rule]; } /** * @task rvalidation */ private function computeRuleReducible($rule, array &$reducible) { $epsilon = $this->getEpsilonSymbol(); $end = $this->getEndSymbol(); $productions = $this->rules[$rule]; // In the first pass, try to find a trivially reducible production, e.g. one // with epsilon or only terminals. Also, remove recursive productions (those // which directly involve the rule itself) because we know we won't be able // to reduce them. If we're lucky, this will allow us to determine that the // rule is reducible without recursion. For example, we can immediately // reduce these productions: // // R -> a // R -> b c d // R -> (epsilon) // // We can never reduce these productions: // // R -> R // R -> a R b // // We might be able to reduce these productions, but they aren't as cheap // or easy to figure out, since we need to first determine if other rules // can be reduced: // // R -> X Y // R -> X a // // If we find a reduction, we return immediately. foreach ($productions as $key => $production) { $has_only_terminals = true; foreach ($production as $symbol) { if ($symbol == $end) { break; } else if ($symbol == $epsilon) { // The rule contains an epsilon production, which can always reduce // it. return true; } else if ($symbol == $rule) { // The rule contains itself; this production is never reducible. We // must find another reducible production. unset($productions[$key]); continue 2; } else if ($this->isTerminal($symbol)) { // This is a terminal; keep looking. We'll be able to reduce the // production if it contains only terminals. continue; } else { // This is a rule, so we can't trivially reduce it. We'll keep it // for the next round if we can't find any trivial reductions. $has_only_terminals = false; break; } } if ($has_only_terminals) { return true; } } // If we have no productions left, this rule can't be reduced. if (empty($productions)) { return false; } // We have remaining productions which include other rules. Look for a // nontrivial reduction. For example: // // R -> X Y // X -> x // Y -> y // // In this case, X and Y are both reducible, so "X Y" is reducible and thus // R is reducible. foreach ($productions as $production) { $can_reduce = true; foreach ($production as $symbol) { // NOTE: We don't need to check for epsilon here, because we would // already have determined the rule was reducible if we had an epsilon // production. if ($symbol == $end) { break; } else if ($this->isTerminal($symbol)) { continue; } else if (!$this->isRuleReducible($symbol, $reducible)) { $can_reduce = false; break; } } if ($can_reduce) { // The production contained only terminals and reducible rules, so it // is reducible. We're good and don't need to examine remaining // productions. return true; } } // We didn't find any reducible productions. return false; } /* -( Computing First() )-------------------------------------------------- */ private function buildFirstTable() { $this->firstTable = array(); foreach ($this->rules as $rule => $productions) { $this->buildRuleFirst($rule); } } private function buildRuleFirst($rule) { if (isset($this->firstTable[$rule])) { return $this->firstTable[$rule]; } $this->firstTable[$rule] = array(); $productions = $this->rules[$rule]; foreach ($productions as $key => $production) { $this->firstTable[$rule] += $this->getFirstForProduction($production); } return $this->firstTable[$rule]; } private function getFirstForProduction(array $production) { $set = array(); $end = $this->getEndSymbol(); $epsilon = $this->getEpsilonSymbol(); $eof = $this->getEOFSymbol(); $accept_epsilon = true; foreach ($production as $symbol) { if ($symbol === $end) { break; } else if ($symbol === $epsilon) { break; } else if ($this->isTerminal($symbol)) { $set[$symbol] = true; $accept_epsilon = false; break; } else { $symbol_set = $this->buildRuleFirst($symbol); $has_epsilon = isset($symbol_set[$epsilon]); unset($symbol_set[$epsilon]); $set += $symbol_set; if (!$has_epsilon) { $accept_epsilon = false; break; } } } if ($accept_epsilon) { $set[$epsilon] = true; } return $set; } /* -( Computing States )--------------------------------------------------- */ private function addState(array $set) { $seen = array(); foreach ($set as $item) { $seen[$item[0]][$item[1]][$item[2]][$item[3]] = true; } $end = $this->getEndSymbol(); $epsilon = $this->getEpsilonSymbol(); for ($ii = 0; $ii < count($set); $ii++) { $item = $set[$ii]; $production = $this->rules[$item[0]][$item[1]]; $next = $production[$item[2]]; if ($this->isTerminal($next)) { continue; } else if ($next === $epsilon) { continue; } else if ($next === $end) { continue; } $v = array_slice($production, $item[2] + 1, -1); $v[] = $item[3]; $v[] = $end; $firsts = $this->getFirstForProduction($v); foreach ($firsts as $nfirst => $ignored) { if (!$this->isTerminal($nfirst)) { unset($firsts[$nfirst]); } } foreach ($this->rules[$next] as $pkey => $nproduction) { foreach ($firsts as $nfirst => $ignored) { if (isset($seen[$next][$pkey][0][$nfirst])) { continue; } $set[] = array($next, $pkey, 0, $nfirst); $seen[$next][$pkey][0][$nfirst] = true; } } } $hash = $this->hashSet($set); if (isset($this->setHashes[$hash])) { return array(false, $this->setHashes[$hash]); } $this->states[] = $set; $state = last_key($this->states); $this->setHashes[$hash] = $state; return array(true, $state); } private function buildSuccessors($start_state) { $end = $this->getEndSymbol(); $nexts = array(); foreach ($this->states[$start_state] as $item) { $next = $this->rules[$item[0]][$item[1]][$item[2]]; if ($next === $end) { continue; } $nexts[$next][] = array( $item[0], $item[1], $item[2] + 1, $item[3], ); } foreach ($nexts as $next => $items) { list($is_new, $state) = $this->addState($items); $this->successor[$start_state][$next] = $state; if ($is_new) { $this->buildSuccessors($state); } } } private function hashSet(array $set) { foreach ($set as $k => $item) { $set[$k] = implode("\0", $item); } sort($set); $set = implode("\1", $set); return md5($set); } private function buildTables() { $action = array(); $goto = array(); $end = $this->getEndSymbol(); $eof = $this->getEOFSymbol(); $init = $this->getInitSymbol(); foreach ($this->states as $state => $items) { $shift = array(); $reduce = array(); $accept = false; foreach ($items as $item) { $next = $this->rules[$item[0]][$item[1]][$item[2]]; if ($next == $end) { if ($item[0] !== $init) { $reduce[$item[3]][] = $item; } else if ($item[0] === $init && $item[3] === $eof) { $accept = $item; } } else if ($this->isTerminal($next)) { $shift[$next] = $item; } else { $goto[$state][$next] = $this->successor[$state][$next]; } } foreach ($reduce as $next => $reductions) { if (count($reductions) > 1) { $ways = array(); foreach ($reductions as $reduction) { $ways[] = "{$reduction[0]}/{$reduction[1]}"; } $ways = implode('; ', $ways); // TODO: As below, we should have more explicit handling of // reduce/reduce conflicts. For now, just pick the first one. if (false) { throw new Exception( "Reduce/reduce conflict: from state '{$state}', when a ". "'{$next}' is encountered, it may be reduced in multiple ". "ways: {$ways}"); } } $reduce[$next] = head($reductions); } $srconflicts = array_intersect_key($shift, $reduce); foreach ($srconflicts as $next => $ignored) { // TODO: We should probably have better or more explicit handling of // shift/reduce conflicts. For now, we just shift. if (false) { $what = $reduce[$next][0]; throw new Exception( "Shift/reduce conflict: from state '{$state}', when a '{$next}' ". "is encountered, shifting conflicts with reducing '{$what}'."); } else { // Resolve the shift/reduce by shifting. $reduce = array(); } } if ($accept && isset($shift[$eof])) { throw new Exception('Accept/shift conflict!'); } if ($accept && isset($reduce[$eof])) { throw new Exception('Accept/reduce conflict!'); } foreach ($reduce as $next => $item) { $action[$state][$next] = array( 'R', array( $item[0], $item[1], - count($this->rules[$item[0]][$item[1]]) - 1), - ); + count($this->rules[$item[0]][$item[1]]) - 1, + ), + ); } foreach ($shift as $next => $item) { $action[$state][$next] = array( 'S', $this->successor[$state][$next], ); } if ($accept) { $action[$state][$eof] = array('A'); } } $this->actionTable = $action; $this->gotoTable = $goto; } public function generateParserFunction($name) { $out = array(); $out[] = 'function '.$name.'(array $tokens, $callback) {'; $out[] = ' return PhutilParserGenerator::parseTokensWithTables('; $out[] = ' '.$this->formatAndIndent($this->actionTable, 4).','; $out[] = ' '.$this->formatAndIndent($this->gotoTable, 4).','; $out[] = ' '.$this->formatAndIndent($this->getEOFSymbol(), 4).','; $out[] = ' $tokens,'; $out[] = ' $callback);'; $out[] = '}'; return implode("\n", $out); } private function formatAndIndent($var, $depth) { $var = phutil_var_export($var); $var = str_replace("\n", "\n".str_repeat(' ', $depth), $var); return $var; } public function parseTokens(array $tokens, $callback) { return self::parseTokensWithTables( $this->actionTable, $this->gotoTable, $this->getEOFSymbol(), $tokens, $callback); } public static function parseTokensWithTables( $action_table, $goto_table, $eof_symbol, array $tokens, $callback) { $state_stack = array(0); $token_stack = array(); $tokens = array_reverse($tokens); while (true) { $state = end($state_stack); if (empty($tokens)) { $next = $eof_symbol; } else { $next_token = end($tokens); $next = $next_token[0]; } if (!isset($action_table[$state][$next])) { $expected = implode(', ', array_keys($action_table[$state])); throw new Exception( "Unexpected '{$next}' in state {$state}! Expected: ". $expected); } $action = $action_table[$state][$next]; switch ($action[0]) { case 'S': $state_stack[] = $action[1]; $token_stack[] = array_pop($tokens); break; case 'R': $r_rule = $action[1][0]; $r_prod = $action[1][1]; $r_size = $action[1][2]; $token_v = array(); while ($r_size--) { $token_v[] = array_pop($token_stack); array_pop($state_stack); } $token_v = array_reverse($token_v); $token_stack[] = call_user_func_array( $callback, array($r_rule, $r_prod, $token_v)); $goto = $goto_table[end($state_stack)][$r_rule]; $state_stack[] = $goto; break; case 'A': break 2; } } return head($token_stack); } /* -( Inspecting Generator State )----------------------------------------- */ /** * @task inspect */ public function inspectRules() { if (!$this->rulesValidated) { throw new Exception('Call processGrammar() before inspectRules()!'); } return $this->rules; } /** * @task inspect */ public function inspectFirstTable() { if ($this->firstTable === null) { throw new Exception('Call processGrammar() before inspectFirstTable()!'); } return $this->firstTable; } }