Page MenuHomec4science

PhutilParserGenerator.php
No OneTemporary

File Metadata

Created
Sun, May 19, 12:45

PhutilParserGenerator.php

<?php
/**
* Simple LR(1) parser generator. Generally, build a parser by setting terminals
* and rules, then calling @{method:processGrammar}. For example, here is a
* simple grammar which accepts one or more "a" followed by exactly one "b":
*
* $parser = id(new PhutilParserGenerator())
* ->setTerminals(array('a', 'b'))
* ->setStartRule('S')
* ->setRules(
* array(
* 'S' => 'A b',
* 'A' => array(
* 'A a',
* 'a',
* )))
* ->processGrammar();
*
* To actually parse token streams, use @{method:parseTokens}.
*
* $tokens = get_tokens(); // Usually from PhutilLexer
* $callback = 'some_callback';
* $tree = $parser->parseTokens($tokens, $callback);
*
* The callback is invoked when a grammar rule matches. It should have this
* signature:
*
* function parser_callback($rule, $production, array $tokens) {
* // ...
* }
*
* The `$rule` is the matching rule; the `$production` is the matching
* production, and `$tokens` is the matching tokens (for terminal rules) or the
* return value of previous parse callbacks (for nonterminal rules).
*
* You should either return a result of evaluation, or some sort of abstract
* representation of the parse tree (this is more likely to be useful for more
* complex grammars).
*
* NOTE: This class generates LR(1) parsers, which perform less-than-optimally
* on large grammars. Worse, it is written in PHP. It is suitable only for
* very simple grammars with few states.
*
* NOTE: These parsers silently resolve reduce/reduce conflicts by choosing the
* first reduction, and silently resolve shift/reduce conflicts by shifting.
* These are the same rules used by Yacc, but are implicit.
*
* @task rules Grammar Rules
* @task rvalidation Rule Validation
* @task first Computing First()
* @task tables Computing Action and Goto Tables
* @task inspect Inspecting Generator State
*/
final class PhutilParserGenerator {
private $terminals;
private $rules;
private $startRule = 'start';
private $sets = array();
private $successor = array();
private $setHashes = array();
private $actionTable;
private $gotoTable;
private $rulesValidated = false;
private $eofSymbol;
private $initSymbol;
private $epsilonSymbol;
private $endSymbol;
private $firstTable;
public function processGrammar() {
$this->validateRules();
$this->buildFirstTable();
$init = $this->getInitSymbol();
$eof = $this->getEOFSymbol();
$end = $this->getEndSymbol();
$this->rules[$init] = array(
array($this->startRule, $end),
);
list($is_new, $state) = $this->addState(
array(
array($this->getInitSymbol(), 0, 0, $eof),
));
$this->buildSuccessors($state);
$this->buildTables();
return $this;
}
/* -( Grammar Rules )------------------------------------------------------ */
public function setTerminals(array $terminals) {
$this->terminals = array_fill_keys($terminals, true);
return $this;
}
public function setRules(array $rules) {
$this->rules = $rules;
return $this;
}
public function setStartRule($rule_name) {
$this->startRule = $rule_name;
return $this;
}
public function getStartRule() {
return $this->startRule;
}
public function getEOFSymbol() {
if ($this->eofSymbol === null) {
throw new Exception('Call processGrammar() before getEOFSymbol()!');
}
return $this->eofSymbol;
}
public function getInitSymbol() {
if ($this->initSymbol === null) {
throw new Exception('Call processGrammar() before getInitSymbol()!');
}
return $this->initSymbol;
}
public function getEpsilonSymbol() {
if ($this->epsilonSymbol === null) {
throw new Exception('Call processGrammar() before getEpsilonSymbol()!');
}
return $this->epsilonSymbol;
}
public function getEndSymbol() {
if ($this->endSymbol === null) {
throw new Exception('Call processGrammar() before getEndSymbol()!');
}
return $this->endSymbol;
}
public function isTerminal($symbol) {
return isset($this->terminals[$symbol]);
}
public function isRule($symbol) {
return isset($this->rules[$symbol]);
}
/* -( Rule Validation )---------------------------------------------------- */
/**
* Perform a battery of tests on the provided rules to detect problems which
* would prevent us from generating a parser.
*
* @return void
* @task rvalidation
*/
private function validateRules() {
// Rules must be specified in the right format.
$this->parseRules();
// Rules must contain only known symbols.
$this->validateRuleSymbols();
// The start rule must exist and be valid.
$this->validateStartRule();
// Now, we select printable names for special symbols (EOF, epsilon, etc)
// that don't conflict with any symbols in the grammar.
$this->chooseSpecialSymbols();
// Make sure every terminal can be reached by some rule.
$this->validateAllTerminalsReachable();
// Make sure every rule can be reached.
$this->validateAllRulesReachable();
// Make sure every rule has some valid reduction.
$this->validateAllRulesReducible();
$this->rulesValidated = true;
}
/**
* @task rvalidation
*/
private function parseRules() {
foreach ($this->rules as $rule_name => $rule_variants) {
if (!is_array($rule_variants)) {
$rule_variants = array($rule_variants);
$this->rules[$rule_name] = $rule_variants;
}
foreach ($rule_variants as $vkey => $variant) {
if ($variant === null) {
$variant = array(null);
} else if (!is_array($variant)) {
$variant = preg_split('/\s+/', $variant);
} else {
foreach ($variant as $symbol) {
if (($symbol === null) && count($variant) > 1) {
throw new PhutilInvalidRuleParserGeneratorException(
"Rule '{$rule_name}' contains a production '{$vkey}' which ".
"is nonempty but has a null in it. A rule with other symbols ".
"may not contain null.");
}
}
}
$this->rules[$rule_name][$vkey] = array_values($variant);
}
}
}
/**
* @task rvalidation
*/
private function validateRuleSymbols() {
foreach ($this->rules as $rule => $productions) {
foreach ($productions as $production_name => $production) {
foreach ($production as $symbol) {
if ($symbol === null) {
continue;
}
if ($this->isTerminal($symbol)) {
continue;
}
if ($this->isRule($symbol)) {
continue;
}
$production_string = implode(' ', $production);
throw new PhutilUnknownSymbolParserGeneratorException(
"Symbol '{$symbol}' in production '{$production_name}' ".
"('{$production_string}') of rule '{$rule}' does not name a rule ".
"or terminal. Did you misspell a symbol, fail to specify a ".
"terminal, or forget a rule?");
}
}
}
}
/**
* @task rvalidation
*/
private function validateStartRule() {
$start_rule = $this->getStartRule();
if (!$this->isRule($start_rule)) {
throw new PhutilUnknownSymbolParserGeneratorException(
"Start rule '{$start_rule}' does not appear in the rules for the ".
"grammar. Use setStartRule() to choose a different start rule, or ".
"add a rule named '{$start_rule}'.");
}
}
/**
* @task rvalidation
*/
private function chooseSpecialSymbols() {
$special = array(
'eofSymbol' => '(end-of-file)',
'epsilonSymbol' => '(epsilon)',
'initSymbol' => '(init)',
'endSymbol' => '(end)',
);
foreach ($special as $key => $value) {
while ($this->isRule($value) || $this->isTerminal($value)) {
$value .= "'";
}
$special[$key] = $value;
}
$this->eofSymbol = $special['eofSymbol'];
$this->epsilonSymbol = $special['epsilonSymbol'];
$this->initSymbol = $special['initSymbol'];
$this->endSymbol = $special['endSymbol'];
foreach ($this->rules as $rule => $productions) {
foreach ($productions as $production_name => $production) {
foreach ($production as $key => $symbol) {
if ($symbol === null) {
$this->rules[$rule][$production_name][$key] = $this->epsilonSymbol;
}
}
$this->rules[$rule][$production_name][] = $this->endSymbol;
}
}
$this->terminals[$this->getEOFSymbol()] = true;
}
/**
* @task rvalidation
*/
private function validateAllTerminalsReachable() {
$seen = array();
foreach ($this->rules as $rule => $productions) {
foreach ($productions as $production) {
foreach ($production as $symbol) {
$seen[$symbol] = true;
}
}
}
$missing = array_diff_key($this->terminals, $seen);
unset($missing[$this->getEOFSymbol()]);
if ($missing) {
$missing_terminals = array_keys($missing);
$missing_terminals = implode(', ', $missing_terminals);
throw new PhutilUnreachableTerminalParserGeneratorException(
'Some terminals do not appear in any rule: '.
$missing_terminals);
}
}
/**
* @task rvalidation
*/
private function validateAllRulesReachable() {
$stack = array();
$reachable = $this->computeReachableRules($this->getStartRule(), $stack);
$missing = array_diff_key($this->rules, $reachable);
unset($missing[$this->getStartRule()]);
if ($missing) {
$missing_rules = array_keys($missing);
$missing_rules = implode(', ', $missing_rules);
throw new PhutilUnreachableRuleParserGeneratorException(
'Some rules can never be reached from any production: '.
$missing_rules);
}
}
/**
* @task rvalidation
*/
private function computeReachableRules($rule, array &$stack) {
if (isset($stack[$rule])) {
return $stack[$rule];
}
$stack[$rule] = array();
foreach ($this->rules[$rule] as $production) {
foreach ($production as $symbol) {
if ($this->isRule($symbol)) {
$stack[$rule][$symbol] = true;
$stack[$rule] += $this->computeReachableRules($symbol, $stack);
}
}
}
return $stack[$rule];
}
/**
* @task rvalidation
*/
private function validateAllRulesReducible() {
$reducible = array();
foreach ($this->rules as $rule => $productions) {
if (!$this->isRuleReducible($rule, $reducible)) {
throw new PhutilIrreducibleRuleParserGeneratorException(
"Rule '{$rule}' can never be reduced: it recurses indefinitely ".
"and reaches no production of terminals.");
}
}
}
/**
* @task rvalidation
*/
private function isRuleReducible($rule, array &$reducible) {
if (isset($reducible[$rule])) {
return $reducible[$rule];
}
// Set this ahead of time so we don't end up in an infinite loop if
// rules recurse. We'll overwrite it if we find a reduction.
$reducible[$rule] = false;
$reducible[$rule] = $this->computeRuleReducible($rule, $reducible);
return $reducible[$rule];
}
/**
* @task rvalidation
*/
private function computeRuleReducible($rule, array &$reducible) {
$epsilon = $this->getEpsilonSymbol();
$end = $this->getEndSymbol();
$productions = $this->rules[$rule];
// In the first pass, try to find a trivially reducible production, e.g. one
// with epsilon or only terminals. Also, remove recursive productions (those
// which directly involve the rule itself) because we know we won't be able
// to reduce them. If we're lucky, this will allow us to determine that the
// rule is reducible without recursion. For example, we can immediately
// reduce these productions:
//
// R -> a
// R -> b c d
// R -> (epsilon)
//
// We can never reduce these productions:
//
// R -> R
// R -> a R b
//
// We might be able to reduce these productions, but they aren't as cheap
// or easy to figure out, since we need to first determine if other rules
// can be reduced:
//
// R -> X Y
// R -> X a
//
// If we find a reduction, we return immediately.
foreach ($productions as $key => $production) {
$has_only_terminals = true;
foreach ($production as $symbol) {
if ($symbol == $end) {
break;
} else if ($symbol == $epsilon) {
// The rule contains an epsilon production, which can always reduce
// it.
return true;
} else if ($symbol == $rule) {
// The rule contains itself; this production is never reducible. We
// must find another reducible production.
unset($productions[$key]);
continue 2;
} else if ($this->isTerminal($symbol)) {
// This is a terminal; keep looking. We'll be able to reduce the
// production if it contains only terminals.
continue;
} else {
// This is a rule, so we can't trivially reduce it. We'll keep it
// for the next round if we can't find any trivial reductions.
$has_only_terminals = false;
break;
}
}
if ($has_only_terminals) {
return true;
}
}
// If we have no productions left, this rule can't be reduced.
if (empty($productions)) {
return false;
}
// We have remaining productions which include other rules. Look for a
// nontrivial reduction. For example:
//
// R -> X Y
// X -> x
// Y -> y
//
// In this case, X and Y are both reducible, so "X Y" is reducible and thus
// R is reducible.
foreach ($productions as $production) {
$can_reduce = true;
foreach ($production as $symbol) {
// NOTE: We don't need to check for epsilon here, because we would
// already have determined the rule was reducible if we had an epsilon
// production.
if ($symbol == $end) {
break;
} else if ($this->isTerminal($symbol)) {
continue;
} else if (!$this->isRuleReducible($symbol, $reducible)) {
$can_reduce = false;
break;
}
}
if ($can_reduce) {
// The production contained only terminals and reducible rules, so it
// is reducible. We're good and don't need to examine remaining
// productions.
return true;
}
}
// We didn't find any reducible productions.
return false;
}
/* -( Computing First() )-------------------------------------------------- */
private function buildFirstTable() {
$this->firstTable = array();
foreach ($this->rules as $rule => $productions) {
$this->buildRuleFirst($rule);
}
}
private function buildRuleFirst($rule) {
if (isset($this->firstTable[$rule])) {
return $this->firstTable[$rule];
}
$this->firstTable[$rule] = array();
$productions = $this->rules[$rule];
foreach ($productions as $key => $production) {
$this->firstTable[$rule] += $this->getFirstForProduction($production);
}
return $this->firstTable[$rule];
}
private function getFirstForProduction(array $production) {
$set = array();
$end = $this->getEndSymbol();
$epsilon = $this->getEpsilonSymbol();
$eof = $this->getEOFSymbol();
$accept_epsilon = true;
foreach ($production as $symbol) {
if ($symbol === $end) {
break;
} else if ($symbol === $epsilon) {
break;
} else if ($this->isTerminal($symbol)) {
$set[$symbol] = true;
$accept_epsilon = false;
break;
} else {
$symbol_set = $this->buildRuleFirst($symbol);
$has_epsilon = isset($symbol_set[$epsilon]);
unset($symbol_set[$epsilon]);
$set += $symbol_set;
if (!$has_epsilon) {
$accept_epsilon = false;
break;
}
}
}
if ($accept_epsilon) {
$set[$epsilon] = true;
}
return $set;
}
/* -( Computing States )--------------------------------------------------- */
private function addState(array $set) {
$seen = array();
foreach ($set as $item) {
$seen[$item[0]][$item[1]][$item[2]][$item[3]] = true;
}
$end = $this->getEndSymbol();
$epsilon = $this->getEpsilonSymbol();
for ($ii = 0; $ii < count($set); $ii++) {
$item = $set[$ii];
$production = $this->rules[$item[0]][$item[1]];
$next = $production[$item[2]];
if ($this->isTerminal($next)) {
continue;
} else if ($next === $epsilon) {
continue;
} else if ($next === $end) {
continue;
}
$v = array_slice($production, $item[2] + 1, -1);
$v[] = $item[3];
$v[] = $end;
$firsts = $this->getFirstForProduction($v);
foreach ($firsts as $nfirst => $ignored) {
if (!$this->isTerminal($nfirst)) {
unset($firsts[$nfirst]);
}
}
foreach ($this->rules[$next] as $pkey => $nproduction) {
foreach ($firsts as $nfirst => $ignored) {
if (isset($seen[$next][$pkey][0][$nfirst])) {
continue;
}
$set[] = array($next, $pkey, 0, $nfirst);
$seen[$next][$pkey][0][$nfirst] = true;
}
}
}
$hash = $this->hashSet($set);
if (isset($this->setHashes[$hash])) {
return array(false, $this->setHashes[$hash]);
}
$this->states[] = $set;
$state = last_key($this->states);
$this->setHashes[$hash] = $state;
return array(true, $state);
}
private function buildSuccessors($start_state) {
$end = $this->getEndSymbol();
$nexts = array();
foreach ($this->states[$start_state] as $item) {
$next = $this->rules[$item[0]][$item[1]][$item[2]];
if ($next === $end) {
continue;
}
$nexts[$next][] = array(
$item[0],
$item[1],
$item[2] + 1,
$item[3]);
}
foreach ($nexts as $next => $items) {
list($is_new, $state) = $this->addState($items);
$this->successor[$start_state][$next] = $state;
if ($is_new) {
$this->buildSuccessors($state);
}
}
}
private function hashSet(array $set) {
foreach ($set as $k => $item) {
$set[$k] = implode("\0", $item);
}
sort($set);
$set = implode("\1", $set);
return md5($set);
}
private function buildTables() {
$action = array();
$goto = array();
$end = $this->getEndSymbol();
$eof = $this->getEOFSymbol();
$init = $this->getInitSymbol();
foreach ($this->states as $state => $items) {
$shift = array();
$reduce = array();
$accept = false;
foreach ($items as $item) {
$next = $this->rules[$item[0]][$item[1]][$item[2]];
if ($next == $end) {
if ($item[0] !== $init) {
$reduce[$item[3]][] = $item;
} else if ($item[0] === $init && $item[3] === $eof) {
$accept = $item;
}
} else if ($this->isTerminal($next)) {
$shift[$next] = $item;
} else {
$goto[$state][$next] = $this->successor[$state][$next];
}
}
foreach ($reduce as $next => $reductions) {
if (count($reductions) > 1) {
$ways = array();
foreach ($reductions as $reduction) {
$ways[] = "{$reduction[0]}/{$reduction[1]}";
}
$ways = implode('; ', $ways);
// TODO: As below, we should have more explicit handling of
// reduce/reduce conflicts. For now, just pick the first one.
if (false) {
throw new Exception(
"Reduce/reduce conflict: from state '{$state}', when a ".
"'{$next}' is encountered, it may be reduced in multiple ".
"ways: {$ways}");
}
}
$reduce[$next] = head($reductions);
}
$srconflicts = array_intersect_key($shift, $reduce);
foreach ($srconflicts as $next => $ignored) {
// TODO: We should probably have better or more explicit handling of
// shift/reduce conflicts. For now, we just shift.
if (false) {
$what = $reduce[$next][0];
throw new Exception(
"Shift/reduce conflict: from state '{$state}', when a '{$next}' ".
"is encountered, shifting conflicts with reducing '{$what}'.");
} else {
// Resolve the shift/reduce by shifting.
$reduce = array();
}
}
if ($accept && isset($shift[$eof])) {
throw new Exception('Accept/shift conflict!');
}
if ($accept && isset($reduce[$eof])) {
throw new Exception('Accept/reduce conflict!');
}
foreach ($reduce as $next => $item) {
$action[$state][$next] = array(
'R',
array(
$item[0],
$item[1],
count($this->rules[$item[0]][$item[1]]) - 1));
}
foreach ($shift as $next => $item) {
$action[$state][$next] = array(
'S',
$this->successor[$state][$next]);
}
if ($accept) {
$action[$state][$eof] = array('A');
}
}
$this->actionTable = $action;
$this->gotoTable = $goto;
}
public function generateParserFunction($name) {
$out = array();
$out[] = 'function '.$name.'(array $tokens, $callback) {';
$out[] = ' return PhutilParserGenerator::parseTokensWithTables(';
$out[] = ' '.$this->formatAndIndent($this->actionTable, 4).',';
$out[] = ' '.$this->formatAndIndent($this->gotoTable, 4).',';
$out[] = ' '.$this->formatAndIndent($this->getEOFSymbol(), 4).',';
$out[] = ' $tokens,';
$out[] = ' $callback);';
$out[] = '}';
return implode("\n", $out);
}
private function formatAndIndent($var, $depth) {
$var = phutil_var_export($var);
$var = str_replace("\n", "\n".str_repeat(' ', $depth), $var);
return $var;
}
public function parseTokens(array $tokens, $callback) {
return self::parseTokensWithTables(
$this->actionTable,
$this->gotoTable,
$this->getEOFSymbol(),
$tokens,
$callback);
}
public static function parseTokensWithTables(
$action_table,
$goto_table,
$eof_symbol,
array $tokens,
$callback) {
$state_stack = array(0);
$token_stack = array();
$tokens = array_reverse($tokens);
while (true) {
$state = end($state_stack);
if (empty($tokens)) {
$next = $eof_symbol;
} else {
$next_token = end($tokens);
$next = $next_token[0];
}
if (!isset($action_table[$state][$next])) {
$expected = implode(', ', array_keys($action_table[$state]));
throw new Exception(
"Unexpected '{$next}' in state {$state}! Expected: ".
$expected);
}
$action = $action_table[$state][$next];
switch ($action[0]) {
case 'S':
$state_stack[] = $action[1];
$token_stack[] = array_pop($tokens);
break;
case 'R':
$r_rule = $action[1][0];
$r_prod = $action[1][1];
$r_size = $action[1][2];
$token_v = array();
while ($r_size--) {
$token_v[] = array_pop($token_stack);
array_pop($state_stack);
}
$token_v = array_reverse($token_v);
$token_stack[] = call_user_func_array(
$callback,
array($r_rule, $r_prod, $token_v));
$goto = $goto_table[end($state_stack)][$r_rule];
$state_stack[] = $goto;
break;
case 'A':
break 2;
}
}
return head($token_stack);
}
/* -( Inspecting Generator State )----------------------------------------- */
/**
* @task inspect
*/
public function inspectRules() {
if (!$this->rulesValidated) {
throw new Exception('Call processGrammar() before inspectRules()!');
}
return $this->rules;
}
/**
* @task inspect
*/
public function inspectFirstTable() {
if ($this->firstTable === null) {
throw new Exception('Call processGrammar() before inspectFirstTable()!');
}
return $this->firstTable;
}
}

Event Timeline