Page Menu
Home
c4science
Search
Configure Global Search
Log In
Files
F101574712
create-tokenizer.js
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Award Token
Subscribers
None
File Metadata
Details
File Info
Storage
Attached
Created
Tue, Feb 11, 17:19
Size
11 KB
Mime Type
text/x-c
Expires
Thu, Feb 13, 17:19 (1 d, 20 h)
Engine
blob
Format
Raw Data
Handle
24186281
Attached To
rOACCT Open Access Compliance Check Tool (OACCT)
create-tokenizer.js
View Options
'use strict'
var
assert
=
require
(
'assert'
)
var
createDebug
=
require
(
'debug'
)
var
assign
=
require
(
'../constant/assign.js'
)
var
codes
=
require
(
'../character/codes.js'
)
var
markdownLineEnding
=
require
(
'../character/markdown-line-ending.js'
)
var
chunkedPush
=
require
(
'./chunked-push.js'
)
var
chunkedSplice
=
require
(
'./chunked-splice.js'
)
var
miniflat
=
require
(
'./miniflat.js'
)
var
resolveAll
=
require
(
'./resolve-all.js'
)
var
serializeChunks
=
require
(
'./serialize-chunks.js'
)
var
shallow
=
require
(
'./shallow.js'
)
var
sliceChunks
=
require
(
'./slice-chunks.js'
)
// Normalize a required module so it can always be read through `.default`:
// an object that already carries a `default` export is passed through
// untouched; every other value is boxed as the `default` member of a
// fresh wrapper object.
function _interopDefaultLegacy(e) {
  var hasDefaultExport = Boolean(e) && typeof e === 'object' && 'default' in e

  if (hasDefaultExport) {
    return e
  }

  return {default: e}
}
var
assert__default
=
/*#__PURE__*/
_interopDefaultLegacy
(
assert
)
var
createDebug__default
=
/*#__PURE__*/
_interopDefaultLegacy
(
createDebug
)
var
debug
=
createDebug__default
[
'default'
](
'micromark'
)
// Create a tokenizer.
// Tokenizers deal with one type of data (e.g., containers, flow, text).
// The parser is the object dealing with it all.
// `initialize` works like other constructs, except that only its `tokenize`
// function is used, in which case it doesn’t receive an `ok` or `nok`.
// `from` can be given to set the point before the first character, although
// when further lines are indented, they must be set with `defineSkip`.
/**
 * Create a tokenizer (see the notes above for the overall contract).
 *
 * @param {Object} parser - Exposed to constructs as `context.parser`.
 * @param {Object} initialize - Construct whose `tokenize` yields the initial
 *   state function; its `resolveAll`, if any, is registered too.
 * @param {Object} [from] - Optional starting point (`{line, column, offset}`);
 *   copied, never mutated.
 * @returns {Object} The tokenizer context (`write`, `sliceStream`,
 *   `sliceSerialize`, `now`, `defineSkip`, `events`, …).
 */
function createTokenizer(parser, initialize, from) {
  // Current place in the input (line/column/offset plus the internal
  // `_index`/`_bufferIndex` chunk cursors set below).
  var point = from ? shallow(from) : {line: 1, column: 1, offset: 0}
  // Map of line number -> column, filled by `skip` (`defineSkip`).
  var columnStart = {}
  // Constructs with `resolveAll`, applied once at the end of `write`.
  var resolveAllConstructs = []
  // The input stream: string chunks and/or numeric character codes.
  var chunks = []
  // Tokens that were entered but not yet exited.
  var stack = []
  // Guard flag: set by `consume`, checked by `go`, to catch constructs that
  // forget to consume the code they were given.
  var consumed = true

  // Tools used for tokenizing.
  var effects = {
    consume: consume,
    enter: enter,
    exit: exit,
    attempt: constructFactory(onsuccessfulconstruct),
    check: constructFactory(onsuccessfulcheck),
    interrupt: constructFactory(onsuccessfulcheck, {interrupt: true}),
    lazy: constructFactory(onsuccessfulcheck, {lazy: true})
  }

  // State and tools for resolving and serializing.
  var context = {
    previous: codes.eof,
    events: [],
    parser: parser,
    sliceStream: sliceStream,
    sliceSerialize: sliceSerialize,
    now: now,
    defineSkip: skip,
    write: write
  }

  // The state function.
  var state = initialize.tokenize.call(context, effects)

  // Track which character we expect to be consumed, to catch bugs.
  var expectedCode

  if (initialize.resolveAll) {
    resolveAllConstructs.push(initialize)
  }

  // Store where we are in the input stream.
  point._index = 0
  point._bufferIndex = -1

  return context

  // Feed one slice of chunks into the tokenizer; returns the finished events
  // once the EOF code has been written, an empty array before that.
  function write(slice) {
    chunks = chunkedPush(chunks, slice)

    main()

    // Exit if we’re not done, resolve might change stuff.
    if (chunks[chunks.length - 1] !== codes.eof) {
      return []
    }

    addResult(initialize, 0)

    // Otherwise, resolve, and exit.
    context.events = resolveAll(resolveAllConstructs, context.events, context)

    return context.events
  }

  //
  // Tools.
  //

  // Serialize the chunks a token spans back into a string.
  function sliceSerialize(token) {
    return serializeChunks(sliceStream(token))
  }

  // Get the raw chunks a token spans.
  function sliceStream(token) {
    return sliceChunks(chunks, token)
  }

  // Snapshot of the current point (copy, safe to keep).
  function now() {
    return shallow(point)
  }

  // Define an indent to skip to on a given line (see `accountForPotentialSkip`).
  function skip(value) {
    columnStart[value.line] = value.column
    accountForPotentialSkip()
    debug('position: define skip: `%j`', point)
  }

  //
  // State management.
  //

  // Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
  // `consume`).
  // Here is where we walk through the chunks, which either include strings of
  // several characters, or numerical character codes.
  // The reason to do this in a loop instead of a call is so the stack can
  // drain.
  function main() {
    var chunkIndex
    var chunk

    while (point._index < chunks.length) {
      chunk = chunks[point._index]

      // If we’re in a buffer chunk, loop through it.
      if (typeof chunk === 'string') {
        chunkIndex = point._index

        if (point._bufferIndex < 0) {
          point._bufferIndex = 0
        }

        // `consume` advances `_index`/`_bufferIndex`, which ends this loop.
        while (
          point._index === chunkIndex &&
          point._bufferIndex < chunk.length
        ) {
          go(chunk.charCodeAt(point._bufferIndex))
        }
      } else {
        go(chunk)
      }
    }
  }

  // Deal with one code.
  function go(code) {
    assert__default['default'].equal(
      consumed,
      true,
      'expected character to be consumed'
    )
    consumed = undefined
    debug('main: passing `%s` to %s', code, state.name)
    expectedCode = code
    state = state(code)
  }

  // Move a character forward.
  function consume(code) {
    assert__default['default'].equal(
      code,
      expectedCode,
      'expected given code to equal expected code'
    )

    debug('consume: `%s`', code)

    assert__default['default'].equal(
      consumed,
      undefined,
      'expected code to not have been consumed'
    )
    assert__default['default'](
      code === null
        ? !context.events.length ||
            context.events[context.events.length - 1][0] === 'exit'
        : context.events[context.events.length - 1][0] === 'enter',
      'expected last token to be open'
    )

    if (markdownLineEnding(code)) {
      point.line++
      point.column = 1
      // CRLF counts as two offsets, any other line ending as one.
      point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
      accountForPotentialSkip()
      debug('position: after eol: `%j`', point)
    } else if (code !== codes.virtualSpace) {
      point.column++
      point.offset++
    }

    // Not in a string chunk.
    if (point._bufferIndex < 0) {
      point._index++
    } else {
      point._bufferIndex++

      // At end of string chunk.
      if (point._bufferIndex === chunks[point._index].length) {
        point._bufferIndex = -1
        point._index++
      }
    }

    // Expose the previous character.
    context.previous = code

    // Mark as consumed.
    consumed = true
  }

  // Start a token.
  function enter(type, fields) {
    // `fields` (if given) is mutated into the token itself.
    var token = fields || {}
    token.type = type
    token.start = now()

    assert__default['default'].equal(
      typeof type,
      'string',
      'expected string type'
    )
    assert__default['default'].notEqual(
      type.length,
      0,
      'expected non-empty string'
    )
    debug('enter: `%s`', type)

    context.events.push(['enter', token, context])

    stack.push(token)

    return token
  }

  // Stop a token.
  function exit(type) {
    assert__default['default'].equal(
      typeof type,
      'string',
      'expected string type'
    )
    assert__default['default'].notEqual(
      type.length,
      0,
      'expected non-empty string'
    )
    assert__default['default'].notEqual(
      stack.length,
      0,
      'cannot close w/o open tokens'
    )

    var token = stack.pop()
    token.end = now()

    assert__default['default'].equal(
      type,
      token.type,
      'expected exit token to match current token'
    )

    assert__default['default'](
      !(
        token.start._index === token.end._index &&
        token.start._bufferIndex === token.end._bufferIndex
      ),
      'expected non-empty token (`' + type + '`)'
    )

    debug('exit: `%s`', token.type)

    context.events.push(['exit', token, context])

    return token
  }

  // Use results.
  function onsuccessfulconstruct(construct, info) {
    addResult(construct, info.from)
  }

  // Discard results.
  function onsuccessfulcheck(construct, info) {
    info.restore()
  }

  // Factory to attempt/check/interrupt.
  function constructFactory(onreturn, fields) {
    return hook

    // Handle either an object mapping codes to constructs, a list of
    // constructs, or a single construct.
    function hook(constructs, returnState, bogusState) {
      var listOfConstructs
      var constructIndex
      var currentConstruct
      var info

      return constructs.tokenize || 'length' in constructs
        ? handleListOfConstructs(miniflat(constructs))
        : handleMapOfConstructs

      // Pick the constructs registered for this code from the map.
      function handleMapOfConstructs(code) {
        if (code in constructs || codes.eof in constructs) {
          return handleListOfConstructs(
            constructs.null
              ? /* c8 ignore next */
                miniflat(constructs[code]).concat(miniflat(constructs.null))
              : constructs[code]
          )(code)
        }

        return bogusState(code)
      }

      // Try the constructs in `list` in order (see `nok` for the advance).
      function handleListOfConstructs(list) {
        listOfConstructs = list
        constructIndex = 0
        return handleConstruct(list[constructIndex])
      }

      // Run one construct: snapshot state, then hand off to its `tokenize`.
      function handleConstruct(construct) {
        return start

        function start(code) {
          // To do: not needed to store if there is no bogus state, probably?
          // Currently doesn’t work because `inspect` in document does a check
          // w/o a bogus, which doesn’t make sense. But it does seem to help perf
          // by not storing.
          info = store()
          currentConstruct = construct

          if (!construct.partial) {
            context.currentConstruct = construct
          }

          // Skip constructs that the parser configuration disables by name.
          if (
            construct.name &&
            context.parser.constructs.disable.null.indexOf(construct.name) > -1
          ) {
            return nok(code)
          }

          return construct.tokenize.call(
            // `fields` (e.g. `{interrupt: true}`, `{lazy: true}`) are merged
            // into a shallow copy of the context for this construct only.
            fields ? assign({}, context, fields) : context,
            effects,
            ok,
            nok
          )(code)
        }
      }

      // The construct succeeded: report it and continue with `returnState`.
      function ok(code) {
        assert__default['default'].equal(code, expectedCode, 'expected code')
        consumed = true
        onreturn(currentConstruct, info)
        return returnState
      }

      // The construct failed: roll back and try the next one, or bail out.
      function nok(code) {
        assert__default['default'].equal(code, expectedCode, 'expected code')
        consumed = true
        info.restore()

        if (++constructIndex < listOfConstructs.length) {
          return handleConstruct(listOfConstructs[constructIndex])
        }

        return bogusState
      }
    }
  }

  // Fold a successful construct’s `resolve`/`resolveTo`/`resolveAll` hooks
  // into the event stream, starting at event index `from`.
  function addResult(construct, from) {
    if (construct.resolveAll && resolveAllConstructs.indexOf(construct) < 0) {
      resolveAllConstructs.push(construct)
    }

    if (construct.resolve) {
      chunkedSplice(
        context.events,
        from,
        context.events.length - from,
        construct.resolve(context.events.slice(from), context)
      )
    }

    if (construct.resolveTo) {
      context.events = construct.resolveTo(context.events, context)
    }

    assert__default['default'](
      construct.partial ||
        !context.events.length ||
        context.events[context.events.length - 1][0] === 'exit',
      'expected last token to end'
    )
  }

  // Snapshot the mutable tokenizer state; the returned `restore` puts
  // everything (point, previous code, current construct, events, stack) back.
  function store() {
    var startPoint = now()
    var startPrevious = context.previous
    var startCurrentConstruct = context.currentConstruct
    var startEventsIndex = context.events.length
    var startStack = Array.from(stack)

    return {restore: restore, from: startEventsIndex}

    function restore() {
      point = startPoint
      context.previous = startPrevious
      context.currentConstruct = startCurrentConstruct
      context.events.length = startEventsIndex
      stack = startStack
      accountForPotentialSkip()
      debug('position: restore: `%j`', point)
    }
  }

  // If a skip was defined for the current line (see `skip`) and we are at
  // the line start, jump column/offset past the skipped indentation.
  function accountForPotentialSkip() {
    if (point.line in columnStart && point.column < 2) {
      point.column = columnStart[point.line]
      point.offset += columnStart[point.line] - 1
    }
  }
}
module
.
exports
=
createTokenizer
Event Timeline
Log In to Comment