this repo has no description
at cactus 707 lines 18 kB view raw
/*! 𝦗𖹭
*/
"use strict"
// Script/module polyglot:
// - classic script: `await` is an identifier, so everything after `//` is a
//   comment and the class below is a plain (global) declaration.
// - ES module: `await /2/ / 2` is an expression, so `export default` is live
//   and applies to the class declaration below.
// This is the ONLY default export; do not add another one at the end of the file.
12||+typeof await/2//2; export default
/**
	12y2 markup parser factory
	@implements Parser_Collection

	Instances expose:
	- `parse(text)`: parses a 12y2 markup string into a tree
	- `langs`: a map of language name -> parser function (`{'12y2': parse}`)
**/
class Markup_12y2 { constructor() {
	
	// Placeholders which are expanded inside the token patterns below.
	const MACROS = {
		'{EOL}': "(?![^\\n])",
		'{BOL}': "^",
		'{ANY}': "[^]",
		'{URL_CHARS}': "[-\\w/%&=#+~@$*'!?,.;:]*",
		'{URL_FINAL}': "[-\\w/%&=#+~@$*']",
	}
	// GROUPS[n] is the token type for the (n+1)th capture group of REGEX.
	const GROUPS = []
	let regi = []
	// Regex builder, used as a chain of tagged templates.
	// Each `${'NAME'}` in a template becomes an empty capture group `()`,
	// and NAME is appended to GROUPS. Whichever empty group participates
	// in a match tells us which token type was found.
	// Calling the function without a template finishes the chain and
	// returns the combined RegExp.
	const REGEX = function self(tem, ...groups) {
		if (!tem)
			return new RegExp(regi.join("|"), 'g')
		regi.push(
			tem.raw.join("()")
				.replace(/\\`/g, "`")
				// turn every plain group into a non-capturing group, so that
				// the only capture groups left are the `()` type markers.
				// `(` followed by `]` is skipped: that is a literal paren
				// inside a character class (`[(]`), not a group.
				.replace(/[(](?![?)\]])/g, "(?:")
				.replace(/[{][A-Z_]+[}]/g, match=>MACROS[match])
		)
		GROUPS.push(...groups)
		return self
	}
	`[\n]?[}]${'BLOCK_END'}`
	`[\n]${'NEWLINE'}`
	`{BOL}[#]{1,4}(?=[\[{ ])${'HEADING'}`
	`{BOL}[>](?=[\[{ ])${'QUOTE'}`
	`{BOL}[-]{3,}{EOL}${'DIVIDER'}`
	`([*][*]|[_][_]|[~][~]|[/])${'STYLE'}`
	`[\\]((https?|sbs)${'ESCAPED'}|[a-z]+)(?![a-zA-Z0-9])${'TAG'}`
	`[\\][{][\n]?${'NULL_ENV'}`
	`[\\]{ANY}${'ESCAPED'}`
	`{BOL}[\`]{3}(?!.*?[\`])${'CODE_BLOCK'}`
	`[\`][^\`\n]*([\`]{2}[^\`\n]*)*[\`]?${'INLINE_CODE'}`
	`([!]${'EMBED'})?\b(https?://|sbs:){URL_CHARS}{URL_FINAL}([(]{URL_CHARS}[)]({URL_CHARS}{URL_FINAL})?)?${'LINK'}`
	`{BOL}[|][-][-+]*[-][|]{EOL}${'TABLE_DIVIDER'}` // `{BOL}[|][|][|]{EOL}${'TABLE_DIVIDER'}`
	`{BOL} *[|]${'TABLE_START'}`
	` *[|][|]?${'TABLE_CELL'}`
	`{BOL} *[-]${'LIST_ITEM'}`
	()
	
	//todo: org tables separators?
	// what if we make them enable an ascii art table parsing mode
	// like
	// | heck | 123 |
	// |------+------|
	// | line1 | aaa |
	// | line2 | bbb |
	// creates 2 cells, with 2 lines each, rather than 2 rows.
	// i.e: each added row will just append its contents to the cells
	// of the previous row.
	// maybe this should be an arg instead? on a row, to merge it with prev or etc..
	
	
	// all state is stored in these vars (and REGEX.lastIndex)
	let current, brackets
	
	// About __proto__ in object literals:
	// https://tc39.es/ecma262/multipage/ecmascript-language-expressions.html#sec-runtime-semantics-propertydefinitionevaluation
	const IS_BLOCK = {__proto__:null, code:'block', divider:'block', ROOT:'block', heading:'block', quote:'block', table:'block', table_cell:'block', image:'block', video:'block', audio:'block', spoiler:'block', align:'block', list:'block', list_item:'block', youtube:'block', anchor:'block', table_divider:'block', ruby:'text', key:'text'}
	// 'text' is for inline-block elements
	
	
	// argument processing //
	
	// shared, frozen "no arguments" value. compared by identity (NO_ARGS===rargs)
	const NO_ARGS = []
	NO_ARGS.named = Object.freeze({})
	Object.freeze(NO_ARGS)
	// todo: do we even need named args?
	/**
		Split the text between `[` and `]` into arguments.
		@param {string} arglist - arguments, separated by ";"
		@returns {string[]} positional args, with named args on its `.named` property
	**/
	const parse_args=(arglist)=>{
		let list = [], named = {}
		list.named = named
		for (let arg of arglist.split(";")) {
			let [, name, value] = /^(?:([-\w]*)=)?(.*)$/.exec(arg)
			// value OR =value
			// (this is to allow values to contain =. ex: [=1=2] is "1=2")
			if (!name)
				list.push(value)
			else // name=value
				named[name] = value
		}
		return list
	}
	
	// process an embed url: !https://example.com/image.png[alt=balls]
	// returns [type: String, args: Object]
	const process_embed=(url, rargs)=>{
		let type
		let args = {url}
		for (let arg of rargs) {
			let m
			if ('video'===arg || 'audio'===arg || 'image'===arg) {
				type = arg
			} else if (m = /^(\d+)x(\d+)$/.exec(arg)) {
				args.width = +m[1]
				args.height = +m[2]
			} else {
				// any other positional args are joined back into the alt text
				if (args.alt==undefined)
					args.alt = arg
				else
					args.alt += ";"+arg
			}
		}
		// a named alt= overrides the positional one
		if (rargs.named.alt!=undefined)
			args.alt = rargs.named.alt
		// todo: improve this
		// no explicit type: guess from the url
		if (!type) {
			if (/[.](mp3|ogg|wav|m4a|flac|aac|oga|opus|wma)\b/i.test(url))
				type = 'audio'
			else if (/[.](mp4|mkv|mov|webm|avi|flv|m4v|mpeg|mpg|ogv|ogm|ogx|wmv|xvid)\b/i.test(url))
				type = 'video'
			else if (/^https?:[/][/](?:www[.]|music[.])?(?:youtube[.]com[/]watch[?]v=|youtu[.]be[/]|youtube[.]com[/]shorts[/])[\w-]{11}/.test(url)) {
				// todo: accept [start-end] args maybe?
				type = 'youtube'
			}
		}
		if (!type)
			type = 'image'
		return [type, args]
	}
	const is_color=(arg)=>{
		return ['red', 'orange', 'yellow', 'green', 'blue', 'purple', 'gray'].includes(arg)
	}
	// convert raw args into table cell args ({header, div, color, colspan, rowspan})
	const process_cell_args=(rargs)=>{
		let args = {}
		for (let arg of rargs) {
			let m
			if ("*"===arg || "#"===arg)
				args.header = true
			else if ("-div"===arg)
				args.div = true
			else if (is_color(arg))
				args.color = arg
			else if (m = /^(\d*)x(\d*)$/.exec(arg)) {
				let [, w, h] = m
				if (+w > 1) args.colspan = +w
				if (+h > 1) args.rowspan = +h
			}
		}
		return args
	}
	// convert raw args into table row args ({header})
	const process_row_args=(rargs)=>{
		let args = {}
		for (let arg of rargs) {
			if ("*"===arg || "#"===arg)
				args.header = true
		}
		return args
	}
	
	// tree operations //
	
	// leave the current block and return it (without adding it to the tree)
	const pop=()=>{
		if (current.body)
			brackets--
		let o = current
		current = current.parent
		return o
	}
	
	const get_last=(block)=>{
		return block.content[block.content.length-1]
	}
	
	// append a new node to `dest` and return it
	const push=(dest, type, args, content)=>{
		let node = {type, args, content}
		dest.content.push(node)
		return node
	}
	
	// push text
	const TEXT=(text)=>{
		// leading spaces are dropped right after a block element
		if ('block'===current.prev)
			text = text.replace(/^ +/, "")
		if (text!=="") {
			current.content.push(text) // todo: merge with surrounding textnodes?
			current.prev = 'text'
		}
	}
	
	/**
		Close the current block and add it to its parent.
		@param {boolean} [cancel] - true when the block is being closed
		 implicitly (end of line/input, or an enclosing block closing)
		 rather than by its own closing token
	**/
	const CLOSE=(cancel)=>{
		let o = pop()
		let type = o.type
		
		//if ('newline'===o.prev)
		//	o.content.push("\n")
		
		switch (type) { default: {
			push(current, type, o.args, o.content)
		} break; case 'style': {
			if (cancel) {
				// unclosed style: emit the opening token as text instead
				TEXT(o.args)
				current.content.push(...o.content)
			} else {
				type = {
					__proto__:null,
					'**': 'bold', '__': 'underline',
					'~~': 'strikethrough', '/': 'italic',
				}[o.args]
				push(current, type, null, o.content)
			}
		} break; case 'null_env': {
			current.content.push(...o.content)
		} break; case 'table_divider': {
			// mark the table above; the next row picks this flag up
			let above = get_last(current)
			if (above && 'table'===above.type) {
				above.args = {divider:true}
			}
		} break; case 'table_cell': {
			// push cell if not empty
			if (!cancel || o.content.length) {
				push(current, type, process_cell_args(o.args), o.content)
				current.prev = 'block'
			}
			// cancelled = next row
			if (cancel) {
				// empty cell -> parse arguments as row arguments
				if (!o.content.length) {
					// exception: empty row -> cancel table
					if (!current.content.length) {
						// the row's args hold the source text of the row start
						let row = pop()
						TEXT(row.args)
						return
						// todo: maybe also cancel rows with 1 unclosed cell?
						// like `| abc` -> text
					}
					current.args = process_row_args(o.args)
				} else
					current.args = {}
				// close the row as well
				CLOSE(true)
				return
			}
		} break; case 'list_item': {
			// merge list_item with preceding list
			let dest = current
			let indent = o.args.indent
			do {
				let curr = dest
				dest = get_last(curr)
				if (!dest || 'list'!==dest.type || dest.args.indent>indent) {
					// create a new level in the list
					dest = push(curr, 'list', {indent, style:o.args.kind}, [])
					break
				}
			} while (dest.args.indent !== indent)
			push(dest, type, null, o.content)
		} break; case 'table_row': {
			// merge the row with the preceding table, or start a new table
			let dest = get_last(current)
			if (!dest || 'table'!==dest.type) {
				dest = push(current, 'table', null, [])
			} else {
				// move the divider flag from the table to this row
				if (dest.args && dest.args.divider) {
					delete dest.args.divider
					o.args.divider = true
				}
			}
			push(dest, type, o.args, o.content)
		} }
		
		current.prev = IS_BLOCK[type] || o.prev
	}
	
	// push empty tag
	const BLOCK=(type, args)=>{
		current.content.push({type, args})
		current.prev = IS_BLOCK[type] || 'text'
	}
	
	/**
		Handle a line break.
		@param {boolean} real - if true, first close every open block up to
		 the nearest block with a {body} (or ROOT)
	**/
	const NEWLINE=(real)=>{
		if (real)
			while (!current.body && 'ROOT'!==current.type)
				CLOSE(true)
		// no "\n" is emitted directly after a block element
		if ('block'!==current.prev)
			current.content.push("\n")
		if ('all_newline'!==current.prev)
			current.prev = 'newline'
	}
	
	
	// parsing //
	
	// these are tested against a 2 character string:
	// (char before the token)+(char after the token)
	const STYLE_START
		= /^[\s,][^\s,]|^['"}{(>|][^\s,'"]/
	const STYLE_END
		= /^[^\s,][-\s.,:;!?'"}{)<\\|]/
	const ITALIC_START
		= /^[\s,][^\s,/]|^['"}{(|][^\s,'"/<]/
	const ITALIC_END
		= /^[^\s,/>][-\s.,:;!?'"}{)\\|]/
	// wait, shouldn't \./heck/\. be allowed though? but that wouldn't work since `.` isn't allowed before..
	
	// find the innermost open style block which was opened by `token`
	// (only searches through directly nested style blocks)
	const find_style=(token)=>{
		for (let c=current; 'style'===c.type; c=c.parent)
			if (c.args===token)
				return c
	}
	
	/**
		Decide what a style token means in context.
		@param {string} token - the style token
		@param {string} before - character before the token
		@param {string} after - character after the token
		@returns the open style block which this token closes,
		 `true` if it opens a new style, or undefined if it's just text
	**/
	const check_style=(token, before, after)=>{
		let ital = "/"===token
		let c = find_style(token)
		if (c && (ital ? ITALIC_END : STYLE_END).test(before+after))
			return c
		if ((ital ? ITALIC_START : STYLE_START).test(before+after))
			return true
	}
	// sticky regexes: always set .lastIndex before using these
	const ARG_REGEX = /.*?(?=])/y
	const WORD_REGEX = /[^\s`^()+=\[\]{}\\|"';:,.<>/?!*]*/y
	const CODE_REGEX = /(?: *([-\w.+#$ ]+?) *(?![^\n]))?\n?([^]*?)(?:\n?```|$)/y // ack
	
	/**
		Parse 12y2 markup.
		@param {string} text - markup source
		@returns {object} the root node: {type:'ROOT', content, prev}.
		 `content` is a list of strings and {type, args, content} nodes
	**/
	const parse=(text)=>{
		let tree = {type: 'ROOT', content: [], prev: 'all_newline'}
		current = tree
		brackets = 0
		
		// these use REGEX, text
		const skip_spaces=()=>{
			let pos = REGEX.lastIndex
			while (" "===text.charAt(pos))
				pos++
			REGEX.lastIndex = pos
		}
		// read the language + contents of a code block, up to the closing ```
		const read_code=()=>{
			let pos = REGEX.lastIndex
			CODE_REGEX.lastIndex = pos
			let [, lang, code] = CODE_REGEX.exec(text)
			REGEX.lastIndex = CODE_REGEX.lastIndex
			return [lang, code]
		}
		
		let rargs
		// read [args] if present. result is stored in `rargs` too
		const read_args=()=>{
			let pos = REGEX.lastIndex
			let next = text.charAt(pos)
			if ("["!==next)
				return rargs = NO_ARGS
			ARG_REGEX.lastIndex = pos+1
			let argstr = ARG_REGEX.exec(text)
			if (!argstr) // no closing ] on this line
				return rargs = NO_ARGS
			REGEX.lastIndex = ARG_REGEX.lastIndex+1 // +1 to skip the ]
			return rargs = parse_args(argstr[0])
		}
		
		let body
		// check for the start of a {body}. result is stored in `body` too:
		// true: found `{`
		// undefined: no body (if `space` is set: a space was found and skipped)
		// false: `space` is set, but there was no space
		const read_body=(space=false)=>{
			let pos = REGEX.lastIndex
			let next = text.charAt(pos)
			if ("{"===next) {
				// also skip one newline after the {
				if ("\n"===text.charAt(pos+1))
					pos++
				REGEX.lastIndex = pos+1
				return body = true
			}
			if (space) {
				if (" "===next)
					REGEX.lastIndex = pos+1
				else
					return body = false
			}
			return body = undefined
		}
		// start a new block
		const OPEN=(type, args=null)=>{
			current = Object.seal({
				type, args, content: [],
				body, parent: current,
				prev: 'all_newline',
			})
			if (body)
				brackets++
		}
		// for tags without a {body}: the tag applies to the next word only
		const word_maybe=()=>{
			if (!body) {
				TEXT(read_word())
				CLOSE()
			}
		}
		
		let match
		// `last` is the end of the last accepted token:
		// the text between `last` and the next accepted token is plain text
		let last = REGEX.lastIndex = 0
		// reject the current match, and resume searching at `index`
		const NEVERMIND=(index=match.index+1)=>{
			REGEX.lastIndex = index
		}
		// accept the current match, and flush the plain text before it
		const ACCEPT=()=>{
			TEXT(text.substring(last, match.index))
			last = REGEX.lastIndex
		}
		const read_word=()=>{
			let pos = REGEX.lastIndex
			WORD_REGEX.lastIndex = pos
			let word = WORD_REGEX.exec(text)
			if (!word)
				return null
			last = REGEX.lastIndex = WORD_REGEX.lastIndex
			return word[0]
		}
		
		let prev = -1
		main: while (match = REGEX.exec(text)) {
			// check for infinite loops
			// NOTE(review): this throws an array rather than an Error.
			// kept as-is, in case callers depend on the thrown value.
			if (match.index===prev)
				throw ["INFINITE LOOP", match]
			prev = match.index
			// 2: figure out which token type was matched
			let token = match[0]
			// the first capture group that matched (they only ever match "")
			let group_num = match.indexOf("", 1)-1
			let type = GROUPS[group_num]
			// 3:
			body = null
			rargs = null
			
			switch (type) {
			case 'TAG': {
				read_args()
				if (token==='\\link') {
					read_body(false)
				} else {
					read_body(true)
					// a tag must be followed by [args], {body}, or a space
					if (NO_ARGS===rargs && false===body) {
						NEVERMIND()
						continue main
					}
				}
				ACCEPT()
				switch (token) { default: {
					let args = {text:text.substring(match.index, last), reason:"invalid tag"}
					if (body)
						OPEN('invalid', args)
					else
						BLOCK('invalid', args)
				} break; case '\\sub': {
					OPEN('subscript')
					word_maybe()
				} break; case '\\sup': {
					OPEN('superscript')
					word_maybe()
				} break; case '\\sm': {
					OPEN('small')
					word_maybe()
				} break; case '\\sc': {
					OPEN('small_caps')
					word_maybe()
				} break; case '\\ov': {
					OPEN('overline')
					word_maybe()
				} break; case '\\b': {
					OPEN('bold')
					word_maybe()
				} break; case '\\i': {
					OPEN('italic')
					word_maybe()
				} break; case '\\u': {
					OPEN('underline')
					word_maybe()
				} break; case '\\s': {
					OPEN('strikethrough')
					word_maybe()
				} break; case '\\quote': {
					OPEN('quote', {cite: rargs[0]})
				} break; case '\\align': {
					let a = rargs[0]
					if (!['left', 'right', 'center'].includes(a))
						a = 'center'
					OPEN('align', {align: a})
				} break; case '\\spoiler': case '\\h': {
					let [label="spoiler"] = rargs
					let cw = /\bcw\b|🔞/i.test(label)
					OPEN('spoiler', {label, cw})
				} break; case '\\ruby': {
					let [txt="true"] = rargs
					OPEN('ruby', {text: txt})
					word_maybe()
				} break; case '\\key': {
					OPEN('key')
					word_maybe()
				} break; case '\\a': {
					let id = rargs[0]
					id = id ? id.replace(/\W+/g, "-") : null
					OPEN('anchor', {id})
					body = true // ghhhh?
					//BLOCK('anchor', {id})
				} break; case '\\link': {
					let [url=""] = rargs
					let args = {url}
					if (body) {
						OPEN('link', args)
					} else {
						BLOCK('simple_link', args)
					}
				} break; case '\\bg': {
					let color = rargs[0]
					if (!is_color(color))
						color = null
					OPEN('background_color', {color})
				} break; case '\\lang': {
					let [lang=""] = rargs
					OPEN('language', {lang})
					word_maybe()
				}}
			} break; case 'STYLE': {
				let c = check_style(token, text.charAt(match.index-1)||"\n", text.charAt(REGEX.lastIndex)||"\n")
				if (!c) { // no
					NEVERMIND()
					continue main
				}
				ACCEPT()
				if (true===c) { // open new
					OPEN('style', token)
				} else { // close
					while (current !== c)
						CLOSE(true)
					CLOSE()
				}
			} break; case 'TABLE_CELL': {
				// only valid inside a table cell (possibly nested in styles)
				for (let c=current; ; c=c.parent) {
					if ('table_cell'===c.type) {
						read_args()
						skip_spaces()
						ACCEPT()
						while (current!==c)
							CLOSE(true)
						CLOSE() // cell
						// TODO: HACK
						// `||` marks the cell which was just closed
						if (/^ *[|][|]/.test(token)) {
							let prev_cell = current.content[current.content.length-1]
							prev_cell.args.div = true
						}
						// we don't know whether these are row args or cell args,
						// so just pass the raw args directly, and parse them later.
						OPEN('table_cell', rargs)
						break
					}
					if ('style'!==c.type) {
						// normally NEVERMIND skips one char,
						// e.g. if we parse "abc" and that matches but gets rejected, it'll try parsing at "bc".
						// but table cell tokens can look like this: "   ||"
						// if we skip 1 char (a space), it would try to parse a table cell again several times.
						// so instead we skip to the end of the token because we know it's safe in this case.
						NEVERMIND(REGEX.lastIndex)
						continue main
					}
				}
			} break; case 'TABLE_DIVIDER': {
				//skip_spaces()
				// only valid directly after a table
				let tbl = get_last(current)
				if (!tbl || 'table'!==tbl.type) {
					NEVERMIND()
					continue main
				}
				ACCEPT()
				OPEN('table_divider')
			} break; case 'TABLE_START': {
				read_args()
				skip_spaces()
				ACCEPT()
				// the row temporarily stores the source text of its start token
				// as its args, so that it can be turned back into text if the
				// row ends up empty (see CLOSE)
				let args_token = text.substring(match.index, last)
				OPEN('table_row', args_token)
				OPEN('table_cell', rargs)
			} break; case 'NEWLINE': {
				ACCEPT()
				NEWLINE(true)
				body = true // to trigger start_line
			} break; case 'HEADING': {
				read_args()
				read_body(true)
				if (NO_ARGS===rargs && false===body) {
					NEVERMIND()
					continue main
				}
				ACCEPT()
				let level = token.length
				let args = {level}
				let id = rargs[0]
				args.id = id ? id.replace(/\W+/g, "-") : null
				// todo: anchor name (and, can this be chosen automatically based on contents?)
				OPEN('heading', args)
			} break; case 'DIVIDER': {
				ACCEPT()
				BLOCK('divider')
			} break; case 'BLOCK_END': {
				ACCEPT()
				if (brackets>0) {
					// close everything up to the nearest block with a {body}
					while (!current.body)
						CLOSE(true)
					if ('invalid'===current.type) {
						if ("\n}"===token)
							NEWLINE(false) // false since we already closed everything
						TEXT("}")
					}
					CLOSE()
				} else {
					// hack:
					// nothing to close: treat it as text
					if ("\n}"===token)
						NEWLINE(true)
					TEXT("}")
				}
			} break; case 'NULL_ENV': {
				body = true
				ACCEPT()
				OPEN('null_env')
				current.prev = current.parent.prev
			} break; case 'ESCAPED': {
				ACCEPT()
				if ("\\\n"===token)
					NEWLINE(false)
				else if ("\\."===token) { // \. is a no-op
					// todo: close lists too
					//current.content.push("")
					//current.prev = 'block'
				} else {
					current.content.push(token.slice(1))
					current.prev = 'text'
				}
			} break; case 'QUOTE': {
				read_args()
				read_body(true)
				if (NO_ARGS===rargs && false===body) {
					NEVERMIND()
					continue main
				}
				ACCEPT()
				OPEN('quote', {cite: rargs[0]})
			} break; case 'CODE_BLOCK': {
				let [lang, code] = read_code()
				ACCEPT()
				BLOCK('code', {text:code, lang})
			} break; case 'INLINE_CODE': {
				ACCEPT()
				BLOCK('icode', {text: token.replace(/^`|`$/g, "").replace(/``/g, "`")})
			} break; case 'EMBED': {
				read_args()
				ACCEPT()
				let url = token.substring(1) // ehh better
				let [type, args] = process_embed(url, rargs)
				BLOCK(type, args)
			} break; case 'LINK': {
				read_args()
				read_body(false)
				ACCEPT()
				let url = token
				let args = {url}
				if (body) {
					OPEN('link', args)
				} else {
					args.text = rargs[0]
					BLOCK('simple_link', args)
				}
			} break; case 'LIST_ITEM': {
				read_args()
				read_body(true)
				if (NO_ARGS===rargs && false===body) {
					NEVERMIND()
					continue main
				}
				ACCEPT()
				let indent = token.indexOf("-")
				OPEN('list_item', {indent, kind:rargs[0]==="1"?"1":undefined})
			} }
			
			// REGEX doesn't have the `m` flag, so {BOL} (^) only matches at
			// the start of the string. after a newline or the start of a
			// {body}, cut off the consumed text, so that what follows is
			// at the start of the string.
			if (body) {
				text = text.substring(last)
				last = REGEX.lastIndex = 0
				prev = -1
			}
		} // end of main loop
		
		TEXT(text.substring(last)) // text after last token
		
		while ('ROOT'!==current.type)
			CLOSE(true)
		if ('newline'===current.prev)
			current.content.push("\n")
		
		current = null // my the memory leak!
		
		return tree
	} /* parse() */
	
	this.parse = parse
	this.langs = {'12y2': parse}
} }

// what if you want to write like, "{...}". well that's fine
// BUT if you are inside a tag, the } will close it.
// maybe closing tags should need some kind of special syntax?
// \tag{ ... \}  >{...\} idk..
// or match paired {}s :
// \tag{ ...  {heck} ... } <- closes here

// todo: after parsing a block element: eat the next newline directly

// idea:
// compare ast formats:
// memory, speed, etc.
// {type, args, content}
// [type, args, content]
// [type, args, ...content]