create-tokenizer.mjs
export default createTokenizer
import assert from 'assert'
import createDebug from 'debug'
import assign from '../constant/assign.mjs'
import codes from '../character/codes.mjs'
import markdownLineEnding from '../character/markdown-line-ending.mjs'
import chunkedPush from './chunked-push.mjs'
import chunkedSplice from './chunked-splice.mjs'
import miniflat from './miniflat.mjs'
import resolveAll from './resolve-all.mjs'
import serializeChunks from './serialize-chunks.mjs'
import shallow from './shallow.mjs'
import sliceChunks from './slice-chunks.mjs'
var debug = createDebug('micromark')
// Create a tokenizer.
// Tokenizers deal with one type of data (e.g., containers, flow, text).
// The parser is the object dealing with it all.
// `initialize` works like other constructs, except that only its `tokenize`
// function is used, in which case it doesn’t receive an `ok` or `nok`.
// `from` can be given to set the point before the first character, although
// when further lines are indented, they must be set with `defineSkip`.
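// A rough usage sketch (the initializer and chunks below are illustrative,
// not taken from this file): the parser creates one tokenizer per content
// type and feeds it preprocessed chunks, ending with an eof code, e.g.
//
//   var tokenizer = createTokenizer(parser, initializeFlow, {line: 1, column: 1, offset: 0})
//   tokenizer.write(['some text']) // => [] (not done yet)
//   tokenizer.write([codes.eof]) // => all events
//
// `write` returns an empty array until an eof chunk has been written.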
function createTokenizer(parser, initialize, from) {
var point = from ? shallow(from) : {line: 1, column: 1, offset: 0}
var columnStart = {}
var resolveAllConstructs = []
var chunks = []
var stack = []
var consumed = true
// Tools used for tokenizing.
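// `attempt` keeps the events of a construct that succeeds, whereas `check`,
// `interrupt`, and `lazy` always roll back to the state from before the
// construct ran and only report whether it would match.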
var effects = {
consume: consume,
enter: enter,
exit: exit,
attempt: constructFactory(onsuccessfulconstruct),
check: constructFactory(onsuccessfulcheck),
interrupt: constructFactory(onsuccessfulcheck, {interrupt: true}),
lazy: constructFactory(onsuccessfulcheck, {lazy: true})
}
// State and tools for resolving and serializing.
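// This is the object returned by `createTokenizer`, and it is also what
// construct `tokenize` functions are called on (possibly extended with extra
// fields such as `interrupt` or `lazy`).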
var context = {
previous: codes.eof,
events: [],
parser: parser,
sliceStream: sliceStream,
sliceSerialize: sliceSerialize,
now: now,
defineSkip: skip,
write: write
}
// The state function.
var state = initialize.tokenize.call(context, effects)
// Track which character we expect to be consumed, to catch bugs.
var expectedCode
if (initialize.resolveAll) {
resolveAllConstructs.push(initialize)
}
// Store where we are in the input stream.
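// `_index` is the position in `chunks`; `_bufferIndex` is the position inside
// a string chunk, or `-1` while the current chunk is not a string.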
point._index = 0
point._bufferIndex = -1
return context
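// Add a slice of chunks to the stream, run the state machine over everything
// buffered so far, and return all events once the stream has ended.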
function write(slice) {
chunks = chunkedPush(chunks, slice)
main()
// Exit if we’re not done, resolve might change stuff.
if (chunks[chunks.length - 1] !== codes.eof) {
return []
}
addResult(initialize, 0)
// Otherwise, resolve, and exit.
context.events = resolveAll(resolveAllConstructs, context.events, context)
return context.events
}
//
// Tools.
//
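// Get the text spanned by a token as a string.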
function sliceSerialize(token) {
return serializeChunks(sliceStream(token))
}
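// Get the chunks spanned by a token.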
function sliceStream(token) {
return sliceChunks(chunks, token)
}
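// Get a shallow copy of the current point in the stream.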
function now() {
return shallow(point)
}
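// Remember at which column the content of a line starts, so that positional
// info can jump over skipped indentation (see `accountForPotentialSkip`).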
function skip(value) {
columnStart[value.line] = value.column
accountForPotentialSkip()
debug('position: define skip: `%j`', point)
}
//
// State management.
//
// Main loop (note that `_index` and `_bufferIndex` in `point` are modified by
// `consume`).
// Here is where we walk through the chunks, which either include strings of
// several characters, or numerical character codes.
// The reason to do this in a loop instead of a call is so the stack can
// drain.
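// For example, `chunks` may hold a mix such as `['some text', codes.eof]`:
// strings are walked character by character, while numerical codes (and the
// eof code) are passed to the state machine directly.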
function main() {
var chunkIndex
var chunk
while (point._index < chunks.length) {
chunk = chunks[point._index]
// If we’re in a buffer chunk, loop through it.
if (typeof chunk === 'string') {
chunkIndex = point._index
if (point._bufferIndex < 0) {
point._bufferIndex = 0
}
while (
point._index === chunkIndex &&
point._bufferIndex < chunk.length
) {
go(chunk.charCodeAt(point._bufferIndex))
}
} else {
go(chunk)
}
}
}
// Deal with one code.
function go(code) {
assert.equal(consumed, true, 'expected character to be consumed')
consumed = undefined
debug('main: passing `%s` to %s', code, state.name)
expectedCode = code
state = state(code)
}
// Move a character forward.
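// The given code must be the one the state machine was last called with;
// consuming it updates the position (`line`, `column`, `offset`, `_index`,
// `_bufferIndex`) and exposes it as `context.previous`.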
function consume(code) {
assert.equal(
code,
expectedCode,
'expected given code to equal expected code'
)
debug('consume: `%s`', code)
assert.equal(consumed, undefined, 'expected code to not have been consumed')
assert(
code === null
? !context.events.length ||
context.events[context.events.length - 1][0] === 'exit'
: context.events[context.events.length - 1][0] === 'enter',
'expected last token to be open'
)
if (markdownLineEnding(code)) {
point.line++
point.column = 1
point.offset += code === codes.carriageReturnLineFeed ? 2 : 1
accountForPotentialSkip()
debug('position: after eol: `%j`', point)
} else if (code !== codes.virtualSpace) {
point.column++
point.offset++
}
// Not in a string chunk.
if (point._bufferIndex < 0) {
point._index++
} else {
point._bufferIndex++
// At end of string chunk.
if (point._bufferIndex === chunks[point._index].length) {
point._bufferIndex = -1
point._index++
}
}
// Expose the previous character.
context.previous = code
// Mark as consumed.
consumed = true
}
// Start a token.
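// Events are stored as `['enter' | 'exit', token, context]` tuples; every
// `enter` must later be matched by an `exit` of the same token type.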
function enter(type, fields) {
var token = fields || {}
token.type = type
token.start = now()
assert.equal(typeof type, 'string', 'expected string type')
assert.notEqual(type.length, 0, 'expected non-empty string')
debug('enter: `%s`', type)
context.events.push(['enter', token, context])
stack.push(token)
return token
}
// Stop a token.
function exit(type) {
assert.equal(typeof type, 'string', 'expected string type')
assert.notEqual(type.length, 0, 'expected non-empty string')
assert.notEqual(stack.length, 0, 'cannot close w/o open tokens')
var token = stack.pop()
token.end = now()
assert.equal(type, token.type, 'expected exit token to match current token')
assert(
!(
token.start._index === token.end._index &&
token.start._bufferIndex === token.end._bufferIndex
),
'expected non-empty token (`' + type + '`)'
)
debug('exit: `%s`', token.type)
context.events.push(['exit', token, context])
return token
}
// Use results.
function onsuccessfulconstruct(construct, info) {
addResult(construct, info.from)
}
// Discard results.
function onsuccessfulcheck(construct, info) {
info.restore()
}
// Factory to attempt/check/interrupt.
function constructFactory(onreturn, fields) {
return hook
// Handle either an object mapping codes to constructs, a list of
// constructs, or a single construct.
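// For example: a single construct, a list such as `[constructA, constructB]`,
// or a map from character codes to constructs such as
// `{42: constructA, null: constructB}` (constructs under the `null` key are
// tried for every code). The names here are illustrative.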
function hook(constructs, returnState, bogusState) {
var listOfConstructs
var constructIndex
var currentConstruct
var info
return constructs.tokenize || 'length' in constructs
? handleListOfConstructs(miniflat(constructs))
: handleMapOfConstructs
function handleMapOfConstructs(code) {
if (code in constructs || codes.eof in constructs) {
return handleListOfConstructs(
constructs.null
? /* c8 ignore next */
miniflat(constructs[code]).concat(miniflat(constructs.null))
: constructs[code]
)(code)
}
return bogusState(code)
}
function handleListOfConstructs(list) {
listOfConstructs = list
constructIndex = 0
return handleConstruct(list[constructIndex])
}
function handleConstruct(construct) {
return start
function start(code) {
// To do: no need to store if there is no bogus state, probably?
// Currently doesn’t work because `inspect` in document does a check
// w/o a bogus, which doesn’t make sense. But it does seem to help perf
// by not storing.
info = store()
currentConstruct = construct
if (!construct.partial) {
context.currentConstruct = construct
}
if (
construct.name &&
context.parser.constructs.disable.null.indexOf(construct.name) > -1
) {
return nok(code)
}
return construct.tokenize.call(
fields ? assign({}, context, fields) : context,
effects,
ok,
nok
)(code)
}
}
function ok(code) {
assert.equal(code, expectedCode, 'expected code')
consumed = true
onreturn(currentConstruct, info)
return returnState
}
function nok(code) {
assert.equal(code, expectedCode, 'expected code')
consumed = true
info.restore()
if (++constructIndex < listOfConstructs.length) {
return handleConstruct(listOfConstructs[constructIndex])
}
return bogusState
}
}
}
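// Use the results of a successful construct: register it for `resolveAll` if
// needed and let its `resolve` / `resolveTo` rewrite the events it produced.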
function addResult(construct, from) {
if (construct.resolveAll && resolveAllConstructs.indexOf(construct) < 0) {
resolveAllConstructs.push(construct)
}
if (construct.resolve) {
chunkedSplice(
context.events,
from,
context.events.length - from,
construct.resolve(context.events.slice(from), context)
)
}
if (construct.resolveTo) {
context.events = construct.resolveTo(context.events, context)
}
assert(
construct.partial ||
!context.events.length ||
context.events[context.events.length - 1][0] === 'exit',
'expected last token to end'
)
}
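// Take a snapshot of the current state, and return a `restore` function that
// rolls the tokenizer back to that snapshot when a construct fails or a check
// is discarded.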
function store() {
var startPoint = now()
var startPrevious = context.previous
var startCurrentConstruct = context.currentConstruct
var startEventsIndex = context.events.length
var startStack = Array.from(stack)
return {restore: restore, from: startEventsIndex}
function restore() {
point = startPoint
context.previous = startPrevious
context.currentConstruct = startCurrentConstruct
context.events.length = startEventsIndex
stack = startStack
accountForPotentialSkip()
debug('position: restore: `%j`', point)
}
}
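// If a skip was defined for the current line (see `defineSkip`) and we are at
// the start of that line, move the reported column and offset past the
// skipped part.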
function accountForPotentialSkip() {
if (point.line in columnStart && point.column < 2) {
point.column = columnStart[point.line]
point.offset += columnStart[point.line] - 1
}
}
}