parse.js at cactus · ansxor.ca/markup2

ansxor.ca / markup2
fork atom
this repo has no description
fork atom
markup2 / parse.js
at cactus 707 lines 18 kB view raw
wrap content
ansxor switch from using module.exports to ESM export syntax 3w ago
41ae5452
  1/*! 𝦗𖹭
  2*/
  3"use strict"
  412||+typeof await/2//2; export default
  5/**
  6	12y2 markup parser factory
  7	@implements Parser_Collection
  8**/
  9class Markup_12y2 { constructor() {
 10
	// Named fragments that can be written as {NAME} inside a token pattern.
	// (null prototype so that only the names listed here can ever be found)
	const MACROS = {
		__proto__: null,
		'{EOL}': "(?![^\\n])",
		'{BOL}': "^",
		'{ANY}': "[^]",
		'{URL_CHARS}': "[-\\w/%&=#+~@$*'!?,.;:]*",
		'{URL_FINAL}': "[-\\w/%&=#+~@$*']",
	}
	// GROUPS[i] is the token name attached to the i-th capture group of the
	// combined regex. Every capture group is an empty `()` marker, so the first
	// group that matched "" tells which token was recognised.
	const GROUPS = []
	// one regex source string per token pattern, in priority order
	const regi = []
	/**
		Tagged-template builder for the tokenizer regex.
		Each tagged call registers one alternative: the raw template pieces are
		joined with an empty capture group `()` wherever a ${'NAME'} appeared,
		and NAME is appended to GROUPS. Calling it with no template (the final
		`()` below) returns the finished global RegExp, which is what ends up
		stored in REGEX.
		@param {TemplateStringsArray} [tem] raw pattern pieces
		@param {...string} groups token names, one per substitution
		@return {Function|RegExp} itself (for chaining), or the finished regex
		@throws {Error} if a pattern uses a {MACRO} that is not defined
	**/
	const REGEX = function self(tem, ...groups) {
		if (!tem)
			return new RegExp(regi.join("|"), 'g')
		const source = tem.raw.join("()")
			// \` is only escaped because we are inside a template literal
			.replace(/\\`/g, "`")
			// make plain groups non-capturing, but leave the `()` markers, (?...)
			// constructs, escapes, and character classes alone; without skipping
			// classes, a literal paren written as [(] would become [(?:]
			.replace(/\\[^]|\[(?:\\[^]|[^\]\\])*\]|[(](?![?)])/g, (piece)=>"("===piece ? "(?:" : piece)
			.replace(/[{][A-Z_]+[}]/g, (name)=>{
				const expansion = MACROS[name]
				if (undefined===expansion)
					throw new Error("unknown regex macro: "+name)
				return expansion
			})
		regi.push(source)
		GROUPS.push(...groups)
		return self
	}
	`[\n]?[}]${'BLOCK_END'}`
	`[\n]${'NEWLINE'}`
	`{BOL}[#]{1,4}(?=[\[{ ])${'HEADING'}`
	`{BOL}[>](?=[\[{ ])${'QUOTE'}`
	`{BOL}[-]{3,}{EOL}${'DIVIDER'}`
	`([*][*]|[_][_]|[~][~]|[/])${'STYLE'}`
	`[\\]((https?|sbs)${'ESCAPED'}|[a-z]+)(?![a-zA-Z0-9])${'TAG'}`
	`[\\][{][\n]?${'NULL_ENV'}`
	`[\\]{ANY}${'ESCAPED'}`
	`{BOL}[\`]{3}(?!.*?[\`])${'CODE_BLOCK'}`
	`[\`][^\`\n]*([\`]{2}[^\`\n]*)*[\`]?${'INLINE_CODE'}`
	`([!]${'EMBED'})?\b(https?://|sbs:){URL_CHARS}{URL_FINAL}([(]{URL_CHARS}[)]({URL_CHARS}{URL_FINAL})?)?${'LINK'}`
	`{BOL}[|][-][-+]*[-][|]{EOL}${'TABLE_DIVIDER'}` // alternative idea: a line consisting of three pipes
	`{BOL} *[|]${'TABLE_START'}`
	` *[|][|]?${'TABLE_CELL'}`
	`{BOL} *[-]${'LIST_ITEM'}`
	()
 49	
 50	//todo: org tables separators?
 51	// what if we make them enable an ascii art table parsing mode
 52	// like
 53	// | heck | 123 |
 54	// |------+------|
 55	// | line1 | aaa |
 56	// | line2 | bbb |
 57	// creates 2 cells, with 2 lines each, rather than 2 rows.
 58	// i.e: each added row will just append its contents to the cells
 59	// of the previous row.
 60	// maybe this should be an arg instead? on a row, to merge it with prev or etc..
 61	
 62
 63	// all state is stored in these vars (and REGEX.lastIndex)
	// mutable parser state shared by the helpers below:
	// `current` is the node being filled in, `brackets` counts the open nodes
	// that were started with a {body}
	let current
	let brackets
	
	// About __proto__ in object literals:
	// https://tc39.es/ecma262/multipage/ecmascript-language-expressions.html#sec-runtime-semantics-propertydefinitionevaluation
	// Layout class per node type: 'block' for block elements,
	// 'text' for inline-block elements. Types not listed here yield undefined.
	const IS_BLOCK = {
		__proto__: null,
		code: 'block',
		divider: 'block',
		ROOT: 'block',
		heading: 'block',
		quote: 'block',
		table: 'block',
		table_cell: 'block',
		image: 'block',
		video: 'block',
		audio: 'block',
		spoiler: 'block',
		align: 'block',
		list: 'block',
		list_item: 'block',
		youtube: 'block',
		anchor: 'block',
		table_divider: 'block',
		ruby: 'text',
		key: 'text',
	}
 70	
 71
 72	// argument processing //
 73	
	// Shared, immutable "no arguments" value: an empty frozen array whose
	// `.named` property is an empty frozen object. Compared by identity elsewhere.
	const NO_ARGS = Object.freeze(Object.assign([], {named: Object.freeze({})}))
 77	// todo: do we even need named args?
	/**
		Split the text between [ and ] into arguments.
		Arguments are separated by ";". `name=value` goes into `.named`,
		anything else (including `=value`, whose name is empty) is positional.
		@param {string} arglist argument text, without the brackets
		@return {string[]} positional values, with a `.named` object attached
	**/
	const parse_args=(arglist)=>{
		const positional = []
		const keyed = {}
		positional.named = keyed
		arglist.split(";").forEach((piece)=>{
			// the name part is optional, so a value may itself contain "="
			// (e.g. [=1=2] is the positional value "1=2")
			const [, key, val] = /^(?:([-\w]*)=)?(.*)$/.exec(piece)
			if (key)
				keyed[key] = val
			else
				positional.push(val)
		})
		return positional
	}
 92	
	/**
		Work out what an embed is. Example: !https://example.com/image.png[alt=balls]
		Positional args: 'image'/'video'/'audio' force the type, `WxH` sets the
		size, everything else is collected (joined with ";") as alt text.
		A named `alt` argument overrides the collected alt text.
		Without an explicit type, it is guessed from the url: audio/video file
		extension, then youtube link, and finally 'image'.
		@param {string} url the embedded url
		@param {string[]} rargs positional args, with a `.named` object attached
		@return {[string, Object]} [type, args]; args has `url` and optionally `width`, `height`, `alt`
	**/
	const process_embed=(url, rargs)=>{
		let type
		let args = {url}
		for (let arg of rargs) {
			let m
			if ('video'===arg || 'audio'===arg || 'image'===arg) {
				type = arg
			} else if (m = /^(\d+)x(\d+)$/.exec(arg)) {
				args.width = +m[1]
				args.height = +m[2]
			} else {
				if (args.alt==undefined)
					args.alt = arg
				else
					args.alt += ";"+arg
			}
		}
		if (rargs.named.alt!=undefined)
			args.alt = rargs.named.alt
		// todo: improve this
		if (!type) {
			if (/[.](mp3|ogg|wav|m4a|flac|aac|oga|opus|wma)\b/i.test(url))
				type = 'audio'
			else if (/[.](mp4|mkv|mov|webm|avi|flv|m4v|mpeg|mpg|ogv|ogm|ogx|wmv|xvid)\b/i.test(url))
				type = 'video'
			// the dots in the hostnames are literal: written as [.] so that
			// e.g. "youtubexcom" is not mistaken for youtube.com
			else if (/^https?:[/][/](?:www[.]|music[.])?(?:youtube[.]com[/]watch[?]v=|youtu[.]be[/]|youtube[.]com[/]shorts[/])[\w-]{11}/.test(url)) {
				// todo: accept [start-end] args maybe?
				type = 'youtube'
			}
		}
		if (!type)
			type = 'image'
		return [type, args]
	}
129	const is_color=(arg)=>{
130		return ['red', 'orange', 'yellow', 'green', 'blue', 'purple', 'gray'].includes(arg)
131	}
132	const process_cell_args=(rargs)=>{
133		let args = {}
134		for (let arg of rargs) {
135			let m
136			if ("*"===arg || "#"===arg)
137				args.header = true
138			else if ("-div"===arg)
139				args.div = true
140			else if (is_color(arg))
141				args.color = arg
142			else if (m = /^(\d*)x(\d*)$/.exec(arg)) {
143				let [, w, h] = m
144				if (+w > 1) args.colspan = +w
145				if (+h > 1) args.rowspan = +h
146			}
147		}
148		return args
149	}
150	const process_row_args=(rargs)=>{
151		let args = {}
152		for (let arg of rargs) {
153			if ("*"===arg || "#"===arg)
154				args.header = true
155		}
156		return args
157	}
158
159	// tree operations //
160	
161	const pop=()=>{
162		if (current.body)
163			brackets--
164		let o = current
165		current = current.parent
166		return o
167	}
168	
169	const get_last=(block)=>{
170		return block.content[block.content.length-1]
171	}
172	
173	const push=(dest, type, args, content)=>{
174		let node = {type, args, content}
175		dest.content.push(node)
176		return node
177	}
178	
179	// push text
180	const TEXT=(text)=>{
181		if ('block'===current.prev)
182			text = text.replace(/^ +/, "")
183		if (text!=="") {
184			current.content.push(text) // todo: merge with surrounding textnodes?
185			current.prev = 'text'
186		}
187	}
188	
189	const CLOSE=(cancel)=>{
190		let o = pop()
191		let type = o.type
192		
193		//if ('newline'===o.prev)
194		//	o.content.push("\n")
195		
196		switch (type) { default: {
197			push(current, type, o.args, o.content)
198		} break; case 'style': {
199			if (cancel) {
200				TEXT(o.args)
201				current.content.push(...o.content)
202			} else {
203				type = {
204					__proto__:null,
205					'**': 'bold', '__': 'underline',
206					'~~': 'strikethrough', '/': 'italic',
207				}[o.args]
208				push(current, type, null, o.content)
209			}
210		} break; case 'null_env': {
211			current.content.push(...o.content)
212		} break; case 'table_divider': {
213			let above = get_last(current)
214			if (above && 'table'===above.type) {
215				above.args = {divider:true}
216			}
217		} break; case 'table_cell': {
218			// push cell if not empty
219			if (!cancel || o.content.length) {
220				push(current, type, process_cell_args(o.args), o.content)
221				current.prev = 'block'
222			}
223			// cancelled = next row
224			if (cancel) {
225				// empty cell -> parse arguments as row arguments
226				if (!o.content.length) {
227					// exception: empty row -> cancel table
228					if (!current.content.length) {
229						let o = pop()
230						TEXT(o.args)
231						return
232						// todo: maybe also cancel rows with 1 unclosed cell?
233						// like `| abc` -> text
234					}
235					current.args = process_row_args(o.args)
236				} else
237					current.args = {}
238				CLOSE(true)
239				return
240			}
241		} break; case 'list_item': {
242			// merge list_item with preceeding list
243			let dest = current
244			let indent = o.args.indent
245			do {
246				let curr = dest
247				dest = get_last(curr)
248				if (!dest || 'list'!==dest.type || dest.args.indent>indent) {
249					// create a new level in the list
250					dest = push(curr, 'list', {indent, style:o.args.kind}, [])
251					break
252				}
253			} while (dest.args.indent != indent)
254			push(dest, type, null, o.content)
255		} break; case 'table_row': {
256			let dest = get_last(current)
257			if (!dest || 'table'!==dest.type) {
258				dest = push(current, 'table', null, [])
259			} else {
260				if (dest.args && dest.args.divider) {
261					delete dest.args.divider
262					o.args.divider = true
263				}
264			}
265			push(dest, type, o.args, o.content)
266		} }
267		
268		current.prev = IS_BLOCK[type] || o.prev
269	}
270	
271	// push empty tag
272	const BLOCK=(type, args)=>{
273		current.content.push({type, args})
274		current.prev = IS_BLOCK[type] || 'text'
275	}
276	
277	const NEWLINE=(real)=>{
278		if (real)
279			while (!current.body && 'ROOT'!=current.type)
280				CLOSE(true)
281		if ('block'!==current.prev)
282			current.content.push("\n")
283		if ('all_newline'!==current.prev)
284			current.prev = 'newline'
285	}
286	
287
288	// parsing //
289	
290	const STYLE_START
291		= /^[\s,][^\s,]|^['"}{(>|][^\s,'"]/
292	const STYLE_END
293		= /^[^\s,][-\s.,:;!?'"}{)<\\|]/
294	const ITALIC_START
295		= /^[\s,][^\s,/]|^['"}{(|][^\s,'"/<]/
296	const ITALIC_END
297		= /^[^\s,/>][-\s.,:;!?'"}{)\\|]/
298	// wait, shouldn't \./heck/\. be allowed though? but that wouldn't work since `.` isn't allowed before..
299	
300	const find_style=(token)=>{
301		for (let c=current; 'style'===c.type; c=c.parent)
302			if (c.args===token)
303				return c
304	}
305	
306	const check_style=(token, before, after)=>{
307		let ital = "/"===token
308		let c = find_style(token)
309		if (c && (ital ? ITALIC_END : STYLE_END).test(before+after))
310			return c
311		if ((ital ? ITALIC_START : STYLE_START).test(before+after))
312			return true
313	}
314	const ARG_REGEX = /.*?(?=])/y
315	const WORD_REGEX = /[^\s`^()+=\[\]{}\\|"';:,.<>/?!*]*/y
316	const CODE_REGEX = /(?: *([-\w.+#$ ]+?) *(?![^\n]))?\n?([^]*?)(?:\n?```|$)/y // ack
317	
318	const parse=(text)=>{
319		let tree = {type: 'ROOT', content: [], prev: 'all_newline'}
320		current = tree
321		brackets = 0
322		
323		// these use REGEX, text
324		const skip_spaces=()=>{
325			let pos = REGEX.lastIndex
326			while (" "===text.charAt(pos))
327				pos++
328			REGEX.lastIndex = pos
329		}
330		const read_code=()=>{
331			let pos = REGEX.lastIndex
332			CODE_REGEX.lastIndex = pos
333			let [, lang, code] = CODE_REGEX.exec(text)
334			REGEX.lastIndex = CODE_REGEX.lastIndex
335			return [lang, code]
336		}
337		
338		let rargs
339		const read_args=()=>{
340			let pos = REGEX.lastIndex
341			let next = text.charAt(pos)
342			if ("["!==next)
343				return rargs = NO_ARGS
344			ARG_REGEX.lastIndex = pos+1
345			let argstr = ARG_REGEX.exec(text)
346			if (!argstr)
347				return rargs = NO_ARGS
348			REGEX.lastIndex = ARG_REGEX.lastIndex+1
349			return rargs = parse_args(argstr[0])
350		}
351		
352		let body
353		const read_body=(space=false)=>{
354			let pos = REGEX.lastIndex
355			let next = text.charAt(pos)
356			if ("{"===next) {
357				if ("\n"===text.charAt(pos+1))
358					pos++
359				REGEX.lastIndex = pos+1
360				return body = true
361			}
362			if (space) {
363				if (" "===next)
364					REGEX.lastIndex = pos+1
365				else
366					return body = false
367			}
368			return body = undefined
369		}
370		// start a new block
371		const OPEN=(type, args=null)=>{
372			current = Object.seal({
373				type, args, content: [],
374				body, parent: current,
375				prev: 'all_newline',
376			})
377			if (body)
378				brackets++
379		}
380		const word_maybe=()=>{
381			if (!body) {
382				TEXT(read_word())
383				CLOSE()
384			}
385		}
386		
387		let match
388		let last = REGEX.lastIndex = 0
389		const NEVERMIND=(index=match.index+1)=>{
390			REGEX.lastIndex = index
391		}
392		const ACCEPT=()=>{
393			TEXT(text.substring(last, match.index))
394			last = REGEX.lastIndex
395		}
396		const read_word=()=>{
397			let pos = REGEX.lastIndex
398			WORD_REGEX.lastIndex = pos
399			let word = WORD_REGEX.exec(text)
400			if (!word)
401				return null
402			last = REGEX.lastIndex = WORD_REGEX.lastIndex
403			return word[0]
404		}
405		
406		let prev = -1
407		main: while (match = REGEX.exec(text)) {
408			// check for infinite loops
409			if (match.index===prev)
410				throw ["INFINITE LOOP", match]
411			prev = match.index
412			// 2: figure out which token type was matched
413			let token = match[0]
414			let group_num = match.indexOf("", 1)-1
415			let type = GROUPS[group_num]
416			// 3: 
417			body = null
418			rargs = null
419
420			switch (type) {
421			case 'TAG': {
422				read_args()
423				if (token==='\\link') {
424					read_body(false)
425				} else {
426					read_body(true)
427					if (NO_ARGS===rargs && false===body) {
428						NEVERMIND()
429						continue main
430					}
431				}
432				ACCEPT()
433				switch (token) { default: {
434					let args = {text:text.substring(match.index, last), reason:"invalid tag"}
435					if (body)
436						OPEN('invalid', args)
437					else
438						BLOCK('invalid', args)
439				} break; case '\\sub': {
440					OPEN('subscript')
441					word_maybe()
442				} break; case '\\sup': {
443					OPEN('superscript')
444					word_maybe()
445				} break; case '\\sm': {
446					OPEN('small')
447					word_maybe()
448				} break; case '\\sc': {
449					OPEN('small_caps')
450					word_maybe()
451				} break; case '\\ov': {
452					OPEN('overline')
453					word_maybe()
454				} break; case '\\b': {
455					OPEN('bold')
456					word_maybe()
457				} break; case '\\i': {
458					OPEN('italic')
459					word_maybe()
460				} break; case '\\u': {
461					OPEN('underline')
462					word_maybe()
463				} break; case '\\s': {
464					OPEN('strikethrough')
465					word_maybe()
466				} break; case '\\quote': {
467					OPEN('quote', {cite: rargs[0]})
468				} break; case '\\align': {
469					let a = rargs[0]
470					if (!['left', 'right', 'center'].includes(a))
471						a = 'center'
472					OPEN('align', {align: a})
473				} break; case '\\spoiler': case '\\h': {
474					let [label="spoiler"] = rargs
475					let cw = /\bcw\b|🔞/i.test(label)
476					OPEN('spoiler', {label, cw})
477				} break; case '\\ruby': {
478					let [txt="true"] = rargs
479					OPEN('ruby', {text: txt})
480					word_maybe()
481				} break; case '\\key': {
482					OPEN('key')
483					word_maybe()
484				} break; case '\\a': {
485					let id = rargs[0]
486					id = id ? id.replace(/\W+/g, "-") : null
487					OPEN('anchor', {id})
488					body = true // ghhhh?
489					//BLOCK('anchor', {id})
490				} break; case '\\link': {
491					let [url=""] = rargs
492					let args = {url}
493					if (body) {
494						OPEN('link', args)
495					} else {
496						BLOCK('simple_link', args)
497					}
498				} break; case '\\bg': {
499					let color = rargs[0]
500					if (!is_color(color))
501						color = null
502					OPEN('background_color', {color})
503				} break; case '\\lang': {
504					let [lang=""] = rargs
505					OPEN('language', {lang})
506					word_maybe()
507				}}
508			} break; case 'STYLE': {
509				let c = check_style(token, text.charAt(match.index-1)||"\n", text.charAt(REGEX.lastIndex)||"\n")
510				if (!c) { // no
511					NEVERMIND()
512					continue main
513				}
514				ACCEPT()
515				if (true===c) { // open new
516					OPEN('style', token)
517				} else { // close
518					while (current != c)
519						CLOSE(true)
520					CLOSE()
521				}
522			} break; case 'TABLE_CELL': {
523				for (let c=current; ; c=c.parent) {
524					if ('table_cell'===c.type) {
525						read_args()
526						skip_spaces()
527						ACCEPT()
528						while (current!==c)
529							CLOSE(true)
530						CLOSE() // cell
531						// TODO: HACK
532						if (/^ *[|][|]/.test(token)) {
533							let last = current.content[current.content.length-1]
534							last.args.div = true
535						}
536						// we don't know whether these are row args or cell args,
537						// so just pass the raw args directly, and parse them later.
538						OPEN('table_cell', rargs)
539						break
540					}
541					if ('style'!==c.type) {
542						// normally NEVERMIND skips one char,
543						// e.g. if we parse "abc" and that matches but gets rejected, it'll try parsing at "bc".
544						// but table cell tokens can look like this: "   ||"
545						// if we skip 1 char (a space), it would try to parse a table cell again several times.
546						// so instead we skip to the end of the token because we know it's safe in this case.
547						NEVERMIND(REGEX.lastIndex)
548						continue main
549					}
550				}
551			} break; case 'TABLE_DIVIDER': {
552				//skip_spaces()
553				let tbl = get_last(current)
554				if (!tbl || 'table'!==tbl.type) {
555					NEVERMIND()
556					continue main
557				}
558				ACCEPT()
559				OPEN('table_divider')
560			} break; case 'TABLE_START': {
561				read_args()
562				skip_spaces()
563				ACCEPT()
564				let args_token = text.substring(match.index, last)
565				OPEN('table_row', args_token, false) // special OPEN call
566				OPEN('table_cell', rargs)
567			} break; case 'NEWLINE': {
568				ACCEPT()
569				NEWLINE(true)
570				body = true // to trigger start_line
571			} break; case 'HEADING': {
572				read_args()
573				read_body(true)
574				if (NO_ARGS===rargs && false===body) {
575					NEVERMIND()
576					continue main
577				}
578				ACCEPT()
579				let level = token.length
580				let args = {level}
581				let id = rargs[0]
582				args.id = id ? id.replace(/\W+/g, "-") : null
583				// todo: anchor name (and, can this be chosen automatically based on contents?)
584				OPEN('heading', args)
585			} break; case 'DIVIDER': {
586				ACCEPT()
587				BLOCK('divider')
588			} break; case 'BLOCK_END': {
589				ACCEPT()
590				if (brackets>0) {
591					while (!current.body)
592						CLOSE(true)
593					if ('invalid'===current.type) {
594						if ("\n}"==token)
595							NEWLINE(false) // false since we already closed everything
596						TEXT("}")
597					}
598					CLOSE()
599				} else {
600					// hack:
601					if ("\n}"==token)
602						NEWLINE(true)
603					TEXT("}")
604				}
605			} break; case 'NULL_ENV': {
606				body = true
607				ACCEPT()
608				OPEN('null_env')
609				current.prev = current.parent.prev
610			} break; case 'ESCAPED': {
611				ACCEPT()
612				if ("\\\n"===token)
613					NEWLINE(false)
614				else if ("\\."===token) { // \. is a no-op
615					// todo: close lists too
616					//current.content.push("")
617					//current.prev = 'block'
618				} else {
619					current.content.push(token.slice(1))
620					current.prev = 'text'
621				}
622			} break; case 'QUOTE': {
623				read_args()
624				read_body(true)
625				if (NO_ARGS===rargs && false===body) {
626					NEVERMIND()
627					continue main
628				}
629				ACCEPT()
630				OPEN('quote', {cite: rargs[0]})
631			} break; case 'CODE_BLOCK': {
632				let [lang, code] = read_code()
633				ACCEPT()
634				BLOCK('code', {text:code, lang})
635			} break; case 'INLINE_CODE': {
636				ACCEPT()
637				BLOCK('icode', {text: token.replace(/^`|`$/g, "").replace(/``/g, "`")})
638			} break; case 'EMBED': {
639				read_args()
640				ACCEPT()
641				let url = token.substring(1) // ehh better
642				let [type, args] = process_embed(url, rargs)
643				BLOCK(type, args)
644			} break; case 'LINK': {
645				read_args()
646				read_body(false)
647				ACCEPT()
648				let url = token
649				let args = {url}
650				if (body) {
651					OPEN('link', args)
652				} else {
653					args.text = rargs[0]
654					BLOCK('simple_link', args)
655				}
656			} break; case 'LIST_ITEM': {
657				read_args()
658				read_body(true)
659				if (NO_ARGS===rargs && false===body) {
660					NEVERMIND()
661					continue main
662				}
663				ACCEPT()
664				let indent = token.indexOf("-")
665				OPEN('list_item', {indent, kind:rargs[0]==="1"?"1":undefined})
666			} }
667			
668			if (body) {
669				text = text.substring(last)
670				last = REGEX.lastIndex = 0
671				prev = -1
672			}
673		} // end of main loop
674		
675		TEXT(text.substring(last)) // text after last token
676		
677		while ('ROOT'!==current.type)
678			CLOSE(true)
679		if ('newline'===current.prev)
680			current.content.push("\n")
681		
682		current = null // my the memory leak!
683		
684		return tree
685	} /* parse() */
686	
687	this.parse = parse
688	this.langs = {'12y2': parse}
689} }
690
// (No `export default Markup_12y2` here: the class declaration is already the
// module's default export, via the `export default` that ends the statement
// right before its doc comment. A module may only have one default export;
// a second one is a SyntaxError.)
692
693// what if you want to write like, "{...}". well that's fine
694// BUT if you are inside a tag, the } will close it.
695// maybe closing tags should need some kind of special syntax?
696// \tag{ ... \}  >{...\} idk..
697// or match paired {}s :
698// \tag{ ...  {heck} ... } <- closes here
699
700// todo: after parsing a block element: eat the next newline directly
701
702// idea:
703// compare ast formats:
704// memory, speed, etc.
705// {type, args, content}
706// [type, args, content]
707// [type, args, ...content]