Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F92595169
PhutilLexer.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Thu, Nov 21, 20:07
Size
8 KB
Mime Type
text/x-php
Expires
Sat, Nov 23, 20:07 (2 d)
Engine
blob
Format
Raw Data
Handle
22466094
Attached To
rPHU libphutil
PhutilLexer.php
View Options
<?php
/**
* Slow, inefficient regexp-based lexer. Define rules like this:
*
* array(
* 'start' => array(...),
* 'state1' => array(...),
* 'state2' => array(...),
* )
*
* Lexers start at the state named 'start'. Each state should have a list of
* rules which can match in that state. A list of rules looks like this:
*
* array(
* array('\s+', 'space'),
* array('\d+', 'digit'),
* array('\w+', 'word'),
* )
*
* The lexer operates by processing each rule in the current state in order.
* When one matches, it produces a token. For example, the lexer above would
* lex this text:
*
* 3 asdf
*
* ...to produce these tokens (assuming the rules are for the 'start' state):
*
* array('digit', '3', null),
* array('space', ' ', null),
* array('word', 'asdf', null),
*
* A rule can also cause a state transition:
*
* array('zebra', 'animal', 'saw_zebra'),
*
* This would match the text "zebra", emit a token of type "animal", and change
* the parser state to "saw_zebra", causing the lexer to start using the rules
* from that state.
*
* To pop the lexer's state, you can use the special state '!pop'.
*
* Finally, you can provide additional options in the fourth parameter.
* Supported options are `case-insensitive` and `context`.
*
* Possible values for `context` are `push` (push the token value onto the
* context stack), `pop` (pop the context stack and use it to provide context
* for the token), and `discard` (pop the context stack and throw away the
* value).
*
* For example, to lex text like this:
*
* Class::CONSTANT
*
* You can use a rule set like this:
*
* 'start' => array(
* array('\w+(?=::)', 'class', 'saw_class', array('context' => 'push')),
* ),
* 'saw_class' => array(
* array('::', 'operator'),
* array('\w+', 'constant', '!pop', array('context' => 'pop')),
* ),
*
* This would parse the above text into this token stream:
*
* array('class', 'Class', null),
* array('operator', '::', null),
* array('constant', 'CONSTANT', 'Class'),
*
* For a concrete implementation, see @{class:PhutilPHPFragmentLexer}.
*
* @task lexerimpl Lexer Implementation
* @task rule Lexer Rules
* @task tokens Lexer Tokens
*
* @group lexer
*/
abstract class PhutilLexer {

  /**
   * Cache of the processed rule set, built lazily by the first call to
   * @{method:getTokens}.
   */
  private $processedRules;


/* -(  Lexer Implementation  )----------------------------------------------- */


  /**
   * Return a set of rules for this lexer. See description in
   * @{class:PhutilLexer}.
   *
   * @return dict Lexer rules.
   * @task lexerimpl
   */
  abstract protected function getRawRules();


/* -(  Lexer Rules  )-------------------------------------------------------- */


  /**
   * Process, normalize, and validate the raw lexer rules.
   *
   * Every rule is normalized into a four element list:
   * `(regex, token type, next state or null, options)`. The regex is wrapped
   * in `(`...`)` delimiters, anchored with `\G` and suffixed with pattern
   * modifiers, so it can be handed directly to `preg_match()` with an offset.
   *
   * Recognized options are `context` (`push`, `pop`, `discard` or `null`) and
   * `case-insensitive` (any truthy value adds the `i` pattern modifier).
   *
   * @return dict Map from state name to list of normalized rules.
   * @throws UnexpectedValueException If the raw rules are malformed: wrong
   *         types, no rules for the 'start' state, a rule with the wrong
   *         number of elements, an unknown option, an invalid regular
   *         expression, or a transition to a state which has no rules.
   * @task rule
   */
  protected function getRules() {
    $class = get_class($this);

    $raw_rules = $this->getRawRules();

    if (!is_array($raw_rules)) {
      $type = gettype($raw_rules);
      throw new UnexpectedValueException(
        "Expected {$class}->getRawRules() to return array, got {$type}.");
    }

    if (empty($raw_rules['start'])) {
      throw new UnexpectedValueException(
        "Expected {$class} rules to define rules for state 'start'.");
    }

    $processed_rules = array();
    foreach ($raw_rules as $state => $rules) {

      if (!is_array($rules)) {
        $type = gettype($rules);
        throw new UnexpectedValueException(
          "Expected list of rules for state '{$state}' in {$class}, got ".
          "{$type}.");
      }

      foreach ($rules as $key => $rule) {
        // Guard before count(): counting a non-array is an error or a
        // warning depending on the PHP version, neither of which is useful
        // to someone debugging a rule set.
        if (!is_array($rule)) {
          $type = gettype($rule);
          throw new UnexpectedValueException(
            "Expected rule '{$key}' in state '{$state}' in {$class} to be a ".
            "list (regex, token, [next state], [options]), got {$type}.");
        }

        $n = count($rule);
        if ($n < 2 || $n > 4) {
          throw new UnexpectedValueException(
            "Expected rule '{$key}' in state '{$state}' in {$class} to have ".
            "2-4 elements (regex, token, [next state], [options]), got {$n}.");
        }

        // Reindex, then pad the optional "next state" and "options" slots so
        // every processed rule has exactly four elements.
        $rule = array_values($rule);
        if (count($rule) == 2) {
          $rule[] = null;
        }
        if (count($rule) == 3) {
          $rule[] = array();
        }

        if (!is_array($rule[3])) {
          $type = gettype($rule[3]);
          throw new UnexpectedValueException(
            "Expected options for rule '{$key}' in state '{$state}' in ".
            "{$class} to be a dictionary, got {$type}.");
        }

        $flags = 'sS';

        foreach ($rule[3] as $option => $value) {
          // Strict comparisons: a loose match would let the integer key 0
          // compare equal to an option name on older PHP versions.
          if ($option === 'context') {
            if ($value !== 'push' &&
                $value !== 'pop' &&
                $value !== 'discard' &&
                $value !== null) {
              throw new UnexpectedValueException(
                "Rule '{$key}' in state '{$state}' in {$class} has unknown ".
                "context rule '{$value}', expected 'push', 'pop' or ".
                "'discard'.");
            }
          } else if ($option === 'case-insensitive') {
            if ($value) {
              $flags .= 'i';
            }
          } else {
            throw new UnexpectedValueException(
              "Rule '{$key}' in state '{$state}' in {$class} has unknown ".
              "option '{$option}'.");
          }
        }

        // NOTE: The "\G" assertion is an offset-aware version of "^".
        $rule[0] = '(\\G'.$rule[0].')'.$flags;

        // Compile the pattern once against an empty subject to detect
        // invalid expressions up front. The warning is suppressed and
        // converted into an exception instead.
        if (@preg_match($rule[0], '') === false) {
          $error = error_get_last();
          $message = '';
          if (is_array($error) && isset($error['message'])) {
            $message = $error['message'];
          }
          throw new UnexpectedValueException(
            "Rule '{$key}' in state '{$state}' in {$class} defines an ".
            "invalid regular expression ('{$rule[0]}'): ".
            $message);
        }

        $next_state = $rule[2];
        if ($next_state !== null && $next_state !== '!pop') {
          if (empty($raw_rules[$next_state])) {
            throw new UnexpectedValueException(
              "Rule '{$key}' in state '{$state}' in {$class} transitions to ".
              "state '{$next_state}', but there are no rules for that state.");
          }
        }

        $processed_rules[$state][] = $rule;
      }
    }

    return $processed_rules;
  }


/* -(  Lexer Tokens  )------------------------------------------------------- */


  /**
   * Lex an input string into tokens.
   *
   * Each token is a three element list: `(token type, matched text, context)`
   * where context is the value popped from the context stack for rules using
   * `'context' => 'pop'`, and `null` otherwise.
   *
   * @param string Input string.
   * @param string Initial lexer state.
   * @return list List of lexer tokens.
   * @throws UnexpectedValueException If no rule in the current state matches
   *         the input, a rule matches a zero-length token without a state
   *         transition, a rule pops or discards an empty context stack, or a
   *         rule pops the last state off the state stack.
   * @task tokens
   */
  public function getTokens($input, $initial_state = 'start') {
    if (empty($this->processedRules)) {
      $this->processedRules = $this->getRules();
    }
    $rules = $this->processedRules;

    $position = 0;
    $length = strlen($input);

    $tokens = array();

    // 'start' is always at the bottom of the state stack; a different
    // initial state is pushed on top of it.
    $states = array('start');
    if ($initial_state !== 'start') {
      $states[] = $initial_state;
    }

    $context = array();
    while ($position < $length) {
      $state = end($states);
      $state_rules = isset($rules[$state]) ? $rules[$state] : array();

      foreach ($state_rules as $rule) {
        $matches = null;
        if (!preg_match($rule[0], $input, $matches, 0, $position)) {
          continue;
        }

        list($regexp, $token_type, $next_state, $options) = $rule;

        $text = $matches[0];
        $match_length = strlen($text);
        if (!$match_length) {
          // A zero-length match emits no token. It is only acceptable when
          // the rule changes state; otherwise the lexer would make no
          // progress.
          if ($next_state === null) {
            throw new UnexpectedValueException(
              "Rule '{$regexp}' matched a zero-length token and causes no ".
              "state transition.");
          }
        } else {
          $position += $match_length;

          $copt = isset($options['context']) ? $options['context'] : null;
          $token_context = null;
          if ($copt === 'push') {
            $context[] = $text;
          } else if ($copt === 'pop') {
            if (empty($context)) {
              throw new UnexpectedValueException(
                "Rule '{$regexp}' popped empty context!");
            }
            $token_context = array_pop($context);
          } else if ($copt === 'discard') {
            if (empty($context)) {
              throw new UnexpectedValueException(
                "Rule '{$regexp}' discarded empty context!");
            }
            array_pop($context);
          }

          $tokens[] = array($token_type, $text, $token_context);
        }

        if ($next_state !== null) {
          if ($next_state === '!pop') {
            array_pop($states);
            if (empty($states)) {
              throw new UnexpectedValueException(
                "Rule '{$regexp}' popped off the last state.");
            }
          } else {
            $states[] = $next_state;
          }
        }

        // A rule matched: restart rule matching from the (possibly new)
        // current state at the new position.
        continue 2;
      }

      throw new UnexpectedValueException(
        "No lexer rule matched input at char {$position}.");
    }

    return $tokens;
  }


  /**
   * Merge adjacent tokens of the same type. For example, if a comment is
   * tokenized as <"//", "comment">, this method will merge the two tokens into
   * a single combined token.
   *
   * Two adjacent tokens are merged only when both their type and their
   * context are identical. The comparison is strict, so numeric-looking
   * values such as "07" and "7" are treated as different.
   *
   * @param list List of tokens, as produced by @{method:getTokens}.
   * @return list List of tokens with adjacent matching tokens combined.
   * @task tokens
   */
  public function mergeTokens(array $tokens) {
    $last = null;
    $result = array();
    foreach ($tokens as $token) {
      if ($last === null) {
        $last = $token;
        continue;
      }
      if (($token[0] === $last[0]) && ($token[2] === $last[2])) {
        $last[1] .= $token[1];
      } else {
        $result[] = $last;
        $last = $token;
      }
    }
    if ($last !== null) {
      $result[] = $last;
    }
    return $result;
  }

}
Event Timeline
Log In to Comment