create-tokenizer.mjs
export default createTokenizer
import assert from 'assert'
import createDebug from 'debug'
import assign from '../constant/assign.mjs'
import codes from '../character/codes.mjs'
import markdownLineEnding from '../character/markdown-line-ending.mjs'
import chunkedPush from './chunked-push.mjs'
import chunkedSplice from './chunked-splice.mjs'
import miniflat from './miniflat.mjs'
import resolveAll from './resolve-all.mjs'
import serializeChunks from './serialize-chunks.mjs'
import shallow from './shallow.mjs'
import sliceChunks from './slice-chunks.mjs'
var debug = createDebug('micromark')
// Create a tokenizer.
// Tokenizers deal with one type of data (e.g., containers, flow, text).
// The parser is the object dealing with it all.
// `initialize` works like other constructs, except that only its `tokenize`
// function is used, in which case it doesn’t receive an `ok` or `nok`.
// `from` can be given to set the point before the first character, although
// when further lines are indented, they must be set with `defineSkip`.
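// A rough usage sketch (the initializer and chunks below are illustrative,
// not taken from this file): the parser creates one tokenizer per content
// type and feeds it preprocessed chunks, ending with an eof code, e.g.
//
//   var tokenizer = createTokenizer(parser, initializeFlow, {line: 1, column: 1, offset: 0})
//   tokenizer.write(['some text']) // => [] (not done yet)
//   tokenizer.write([codes.eof]) // => all events
//
// `write` returns an empty array until an eof chunk has been written.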
function createTokenizer(parser, initialize, from) {
var point = from ? shallow(from) : {line: 1, column: 1, offset: 0}
var columnStart = {}
var resolveAllConstructs = []
var chunks = []
var stack = []
var consumed = true
// Tools used for tokenizing.
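// `attempt` keeps the events of a construct that succeeds, whereas `check`,
// `interrupt`, and `lazy` always roll back to the state from before the
// construct ran and only report whether it would match.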
var effects = {
consume: consume,
enter: enter,
exit: exit,
attempt: constructFactory(onsuccessfulconstruct),
check: constructFactory(onsuccessfulcheck),
interrupt: constructFactory(onsuccessfulcheck, {interrupt: true}),
lazy: constructFactory(onsuccessfulcheck, {lazy: true})
}
// State and tools for resolving and serializing.
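// This is the object returned by `createTokenizer`, and it is also what
// construct `tokenize` functions are called on (possibly extended with extra
// fields such as `interrupt` or `lazy`).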
var context = {
previous: codes.eof,
events: [],
parser: parser,
sliceStream: sliceStream,
sliceSerialize: sliceSerialize,
now: now,
defineSkip: skip,
write: write
}
// The state function.
var state = initialize.tokenize.call(context, effects)
// Track which character we expect to be consumed, to catch bugs.
var expectedCode
if (initialize.resolveAll) {
resolveAllConstructs.push(initialize)
}
// Store where we are in the input stream.
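// `_index` is the position in `chunks`; `_bufferIndex` is the position inside
// a string chunk, or `-1` while the current chunk is not a string.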
point._index = 0
point._bufferIndex = -1
return context
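// Add a slice of chunks to the stream, run the state machine over everything
// buffered so far, and return all events once the stream has ended.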
function write(slice) {
chunks = chunkedPush(chunks, slice)
main()
// Exit if we’re not done, resolve might change stuff.
if (chunks[chunks.length - 1] !== codes.eof) {
return []
}
addResult(initialize, 0)
// Otherwise, resolve, and exit.
context.events = resolveAll(resolveAllConstructs, context.events, context)
return context.events
}
//
// Tools.
//
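// Get the text spanned by a token as a string.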
function sliceSerialize(token) {
return serializeChunks(sliceStream(token))
}
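// Get the chunks spanned by a token.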
function sliceStream(token) {
return sliceChunks(chunks, token)
}
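// Get a shallow copy of the current point in the stream.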
function now() {
return shallow(point)
}
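// Remember at which column the content of a line starts, so that positional
// info can jump over skipped indentation (see `accountForPotentialSkip`).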
function skip(value) {
columnStart[value.line] = value.column
accountForPotentialSkip()
debug('position: define skip: `%j`', point)
}
//
// State management.
//
// Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
// `consume`).
// Here is where we walk through the chunks, which either include strings of
// several characters, or numerical character codes.
// The reason to do this in a loop instead of a call is so the stack can
// drain.
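// For example, `chunks` may hold a mix such as `['some text', codes.eof]`:
// strings are walked character by character, while numerical codes (and the
// eof code) are passed to the state machine directly.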
function main() {
var chunkIndex
var chunk
while (point._index < chunks.length) {
chunk = chunks[point._index]
// If we’re in a buffer chunk, loop through it.
if (typeof chunk === 'string') {
chunkIndex = point._index
if (point._bufferIndex < 0) {
point._bufferIndex = 0
}
while (
point._index === chunkIndex &&
point._bufferIndex < chunk.length
) {
go(chunk.charCodeAt(point._bufferIndex))
}
} else {
go(chunk)
}
}
}
// Deal with one code.
function go(code) {
assert.equal(consumed, true, 'expected character to be consumed')
consumed = undefined
debug('main: passing `%s` to %s', code, state.name)
expectedCode = code
state = state(code)
}
// Move a character forward.
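// The given code must be the one the state machine was last called with;
// consuming it updates the position (`line`, `column`, `offset`, `_index`,
// `_bufferIndex`) and exposes it as `context.previous`.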
function consume(code) {
assert.equal(
code,
expectedCode,
'expected given code to equal expected code'
)
debug('consume: `%s`', code)
assert.equal(consumed, undefined, 'expected code to not have been consumed')
assert(
code === null
? !context.events.length ||
context.events[context.events.length - 1][0] === 'exit'
: context.events[context.events.length - 1][0] === 'enter',
'expected last token to be open'
)
if (markdownLineEnding(code)) {
point.line++
point.column = 1
point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
accountForPotentialSkip()
debug('position: after eol: `%j`', point)
} else if (code !== codes.virtualSpace) {
point.column++
point.offset++
}
// Not in a string chunk.
if (point._bufferIndex < 0) {
point._index++
} else {
point._bufferIndex++
// At end of string chunk.
if (point._bufferIndex === chunks[point._index].length) {
point._bufferIndex = -1
point._index++
}
}
// Expose the previous character.
context.previous = code
// Mark as consumed.
consumed = true
}
// Start a token.
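// Events are stored as `['enter' | 'exit', token, context]` tuples; every
// `enter` must later be matched by an `exit` of the same token type.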
function enter(type, fields) {
var token = fields || {}
token.type = type
token.start = now()
assert.equal(typeof type, 'string', 'expected string type')
assert.notEqual(type.length, 0, 'expected non-empty string')
debug('enter: `%s`', type)
context.events.push(['enter', token, context])
stack.push(token)
return token
}
// Stop a token.
function exit(type) {
assert.equal(typeof type, 'string', 'expected string type')
assert.notEqual(type.length, 0, 'expected non-empty string')
assert.notEqual(stack.length, 0, 'cannot close w/o open tokens')
var token = stack.pop()
token.end = now()
assert.equal(type, token.type, 'expected exit token to match current token')
assert(
!(
token.start._index === token.end._index &&
token.start._bufferIndex === token.end._bufferIndex
),
'expected non-empty token (`' + type + '`)'
)
debug('exit: `%s`', token.type)
context.events.push(['exit', token, context])
return token
}
// Use results.
function onsuccessfulconstruct(construct, info) {
addResult(construct, info.from)
}
// Discard results.
function onsuccessfulcheck(construct, info) {
info.restore()
}
// Factory to attempt/check/interrupt.
function constructFactory(onreturn, fields) {
return hook
// Handle either an object mapping codes to constructs, a list of
// constructs, or a single construct.
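// For example: a single construct, a list such as `[constructA, constructB]`,
// or a map from character codes to constructs such as
// `{42: constructA, null: constructB}` (constructs under the `null` key are
// tried for every code). The names here are illustrative.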
function hook(constructs, returnState, bogusState) {
var listOfConstructs
var constructIndex
var currentConstruct
var info
return constructs.tokenize || 'length' in constructs
? handleListOfConstructs(miniflat(constructs))
: handleMapOfConstructs
function handleMapOfConstructs(code) {
if (code in constructs || codes.eof in constructs) {
return handleListOfConstructs(
constructs.null
? /* c8 ignore next */
miniflat(constructs[code]).concat(miniflat(constructs.null))
: constructs[code]
)(code)
}
return bogusState(code)
}
function handleListOfConstructs(list) {
listOfConstructs = list
constructIndex = 0
return handleConstruct(list[constructIndex])
}
function handleConstruct(construct) {
return start
function start(code) {
// To do: no need to store if there is no bogus state, probably?
// Currently doesn’t work because `inspect` in document does a check
// w/o a bogus, which doesn’t make sense. But it does seem to help perf
// by not storing.
info = store()
currentConstruct = construct
if (!construct.partial) {
context.currentConstruct = construct
}
if (
construct.name &&
context.parser.constructs.disable.null.indexOf(construct.name) > -1
) {
return nok(code)
}
return construct.tokenize.call(
fields ? assign({}, context, fields) : context,
effects,
ok,
nok
)(code)
}
}
function ok(code) {
assert.equal(code, expectedCode, 'expected code')
consumed = true
onreturn(currentConstruct, info)
return returnState
}
function nok(code) {
assert.equal(code, expectedCode, 'expected code')
consumed = true
info.restore()
if (++constructIndex < listOfConstructs.length) {
return handleConstruct(listOfConstructs[constructIndex])
}
return bogusState
}
}
}
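// Use the results of a successful construct: register it for `resolveAll` if
// needed and let its `resolve` / `resolveTo` rewrite the events it produced.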
function addResult(construct, from) {
if (construct.resolveAll && resolveAllConstructs.indexOf(construct) < 0) {
resolveAllConstructs.push(construct)
}
if (construct.resolve) {
chunkedSplice(
context.events,
from,
context.events.length - from,
construct.resolve(context.events.slice(from), context)
)
}
if (construct.resolveTo) {
context.events = construct.resolveTo(context.events, context)
}
assert(
construct.partial ||
!context.events.length ||
context.events[context.events.length - 1][0] === 'exit',
'expected last token to end'
)
}
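// Take a snapshot of the current state, and return a `restore` function that
// rolls the tokenizer back to that snapshot when a construct fails or a check
// is discarded.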
function store() {
var startPoint = now()
var startPrevious = context.previous
var startCurrentConstruct = context.currentConstruct
var startEventsIndex = context.events.length
var startStack = Array.from(stack)
return {restore: restore, from: startEventsIndex}
function restore() {
point = startPoint
context.previous = startPrevious
context.currentConstruct = startCurrentConstruct
context.events.length = startEventsIndex
stack = startStack
accountForPotentialSkip()
debug('position: restore: `%j`', point)
}
}
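// If a skip was defined for the current line (see `defineSkip`) and we are at
// the start of that line, move the reported column and offset past the
// skipped part.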
function accountForPotentialSkip() {
if (point.line in columnStart && point.column < 2) {
point.column = columnStart[point.line]
point.offset += columnStart[point.line] - 1
}
}
}