A JavaScript lexer and syntax highlighter for Gleam!

Report errors for unknown characters and unterminated literals
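`tokenise` now returns the lexed tokens alongside any errors, so callers can keep a best-effort token stream and still surface problems. A minimal sketch of the new API (the outputs noted in comments are my reading of the diff below):

```gleam
import just

pub fn main() {
  let #(tokens, errors) = just.tokenise(just.new("let s = 'oops"))

  // The broken literal still appears in the token stream as an
  // UnterminatedString token, so no source text is lost...
  echo tokens

  // ...and the problem is reported separately:
  // [UnterminatedString]
  echo errors
}
```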

+178 -81
+150 -80
src/just.gleam
···
     ignore_whitespace: Bool,
     strict_mode: Bool,
     mode: LexerMode,
+    errors: List(Error),
   )
 }
···
   TreatSlashAsDivision
 }
 
+pub type Error {
+  UnknownCharacter(character: String)
+  UnterminatedString
+  UnterminatedComment
+  UnterminatedRegularExpression
+  UnterminatedTemplate
+}
+
 pub fn new(source: String) -> Lexer {
   Lexer(
     source:,
···
     ignore_whitespace: False,
     strict_mode: False,
     mode: TreatSlashAsRegex,
+    errors: [],
   )
 }
···
   Lexer(..lexer, strict_mode: True)
 }
 
-pub fn tokenise(lexer: Lexer) -> List(Token) {
+pub fn tokenise(lexer: Lexer) -> #(List(Token), List(Error)) {
   let #(lexer, tokens) = maybe_lex_hashbang_comment(lexer)
   do_tokenise(lexer, tokens)
 }
 
-fn update_mode_with_token(lexer: Lexer, token: Token) -> Lexer {
-  let mode = case token {
-    // Comments and whitespace don't affect lexing mode
-    token.SingleLineComment(_)
-    | token.MultiLineComment(_)
-    | token.HashBangComment(_)
-    | token.Whitespace(_)
-    | token.LineTerminator(_)
-    | token.EndOfFile -> lexer.mode
-
-    // Values make us look for division
-    token.Identifier(_)
-    | token.PrivateIdentifier(_)
-    | token.Number(_)
-    | token.BigInt(_)
-    | token.String(..)
-    | token.RegularExpression(_)
-    | token.TemplateTail(_) -> TreatSlashAsDivision
-
-    // These keywords act as values, so we look for division after them
-    token.False | token.Null | token.This | token.True -> TreatSlashAsDivision
-
-    // After a grouping we look for division
-    token.RightParen | token.RightSquare -> TreatSlashAsDivision
-
-    // These can be either postfix or prefix. Either way, we keep the lexing mode the same.
-    token.DoublePlus | token.DoubleMinus -> lexer.mode
-
-    // In any other case, we look for a regular expression next.
-    _ -> TreatSlashAsRegex
-  }
-
-  Lexer(..lexer, mode:)
-}
-
 fn maybe_lex_hashbang_comment(lexer: Lexer) -> #(Lexer, List(Token)) {
   case lexer.source {
     "#!" <> source -> {
···
   }
 }
 
-fn do_tokenise(lexer: Lexer, tokens: List(Token)) -> List(Token) {
+fn do_tokenise(lexer: Lexer, tokens: List(Token)) -> #(List(Token), List(Error)) {
   case next(lexer) {
-    #(_, token.EndOfFile) -> list.reverse([token.EndOfFile, ..tokens])
+    #(lexer, token.EndOfFile) -> #(
+      list.reverse([token.EndOfFile, ..tokens]),
+      list.reverse(lexer.errors),
+    )
     #(lexer, token.TemplateHead(_) as token) -> {
       let #(lexer, tokens) =
         lex_template_parts(
···
   }
 }
 
+fn maybe_token(lexer: Lexer, token: Token, condition: Bool) -> #(Lexer, Token) {
+  case condition {
+    True -> #(lexer, token)
+    False -> next(lexer)
+  }
+}
+
 fn next(lexer: Lexer) -> #(Lexer, Token) {
   case lexer.source {
     "" -> #(lexer, token.EndOfFile)
···
     "\u{000A}" as space <> source
     | "\u{000D}" as space <> source
     | "\u{2028}" as space <> source
-    | "\u{2029}" as space <> source -> {
-      let lexer = advance(lexer, source)
-      case lexer.ignore_whitespace {
-        True -> next(lexer)
-        False -> #(lexer, token.LineTerminator(space))
-      }
-    }
+    | "\u{2029}" as space <> source ->
+      maybe_token(
+        advance(lexer, source),
+        token.LineTerminator(space),
+        !lexer.ignore_whitespace,
+      )
 
     "//" <> source -> {
       let #(lexer, contents) = lex_until_end_of_line(advance(lexer, source), "")
-      case lexer.ignore_comments {
-        True -> next(lexer)
-        False -> #(lexer, token.SingleLineComment(contents))
-      }
+      maybe_token(
+        lexer,
+        token.SingleLineComment(contents),
+        !lexer.ignore_comments,
+      )
     }
-    "/*" <> source -> {
-      let #(lexer, contents) = lex_multiline_comment(advance(lexer, source), "")
-      case lexer.ignore_comments {
-        True -> next(lexer)
-        False -> #(lexer, token.MultiLineComment(contents))
-      }
-    }
+    "/*" <> source -> lex_multiline_comment(advance(lexer, source), "")
 
     "0b" as prefix <> source ->
       lex_radix_number(advance(lexer, source), 2, prefix, False)
···
     | ".0" as digit <> source ->
       lex_number(advance(lexer, source), digit, Decimal, AfterNumber)
 
-    "/" <> source if lexer.mode == TreatSlashAsRegex -> {
-      let #(lexer, value) = lex_regex(advance(lexer, source), "", False)
-      #(lexer, token.RegularExpression(value))
-    }
+    "/" <> source if lexer.mode == TreatSlashAsRegex ->
+      lex_regex(advance(lexer, source), "", False)
 
     "{" <> source -> #(advance(lexer, source), token.LeftBrace)
     "}" <> source -> #(advance(lexer, source), token.RightBrace)
···
       #(lexer, token)
     }
 
-    "'" as quote <> source | "\"" as quote <> source -> {
-      let #(lexer, string) = lex_string(advance(lexer, source), quote, "")
-      #(lexer, token.String(quote, string))
-    }
+    "'" as quote <> source | "\"" as quote <> source ->
+      lex_string(advance(lexer, source), quote, "")
 
     "`" <> source -> lex_template_head(advance(lexer, source), "")
 
-    _ -> #(lexer, token.EndOfFile)
+    _ ->
+      case string.pop_grapheme(lexer.source) {
+        Error(_) -> #(lexer, token.EndOfFile)
+        Ok(#(character, source)) -> #(
+          lexer |> advance(source) |> error(UnknownCharacter(character)),
+          token.Unknown(character),
+        )
+      }
   }
 }
 
-fn lex_multiline_comment(lexer: Lexer, lexed: String) -> #(Lexer, String) {
+fn lex_multiline_comment(lexer: Lexer, lexed: String) -> #(Lexer, Token) {
   case lexer.source {
-    "*/" <> source -> #(advance(lexer, source), lexed)
+    "*/" <> source ->
+      maybe_token(
+        advance(lexer, source),
+        token.MultiLineComment(lexed),
+        !lexer.ignore_comments,
+      )
     _ ->
       case string.pop_grapheme(lexer.source) {
-        Error(_) -> #(lexer, lexed)
+        Error(_) ->
+          maybe_token(
+            error(lexer, UnterminatedComment),
+            token.MultiLineComment(lexed),
+            !lexer.ignore_comments,
+          )
         Ok(#(char, source)) ->
           lex_multiline_comment(advance(lexer, source), lexed <> char)
       }
···
   }
 }
 
-fn lex_string(lexer: Lexer, quote: String, contents: String) -> #(Lexer, String) {
+fn lex_string(lexer: Lexer, quote: String, contents: String) -> #(Lexer, Token) {
   case string.pop_grapheme(lexer.source) {
-    Error(_) -> #(lexer, contents)
+    Error(_) -> #(
+      error(lexer, UnterminatedString),
+      token.UnterminatedString(quote:, contents:),
+    )
+    Ok(#("\n", _source)) | Ok(#("\r", _source)) -> #(
+      error(lexer, UnterminatedString),
+      token.UnterminatedString(quote:, contents:),
+    )
+
     Ok(#(character, source)) if character == quote -> #(
       advance(lexer, source),
-      contents,
+      token.String(quote:, contents:),
     )
     Ok(#("\\", source)) ->
       case string.pop_grapheme(source) {
-        Error(_) -> #(lexer, contents)
+        Error(_) -> #(
+          error(lexer, UnterminatedString),
+          token.UnterminatedString(quote:, contents:),
+        )
         Ok(#(character, source)) ->
           lex_string(
             advance(lexer, source),
···
     "`" <> source -> #(advance(lexer, source), token.String("`", lexed))
     "\\" <> source ->
       case string.pop_grapheme(source) {
-        Error(_) -> #(lexer, token.String("`", lexed))
+        Error(_) -> #(
+          error(lexer, UnterminatedString),
+          token.UnterminatedString("`", lexed),
+        )
         Ok(#(character, source)) ->
           lex_template_head(advance(lexer, source), lexed <> "\\" <> character)
       }
     _ ->
       case string.pop_grapheme(lexer.source) {
-        Error(_) -> #(lexer, token.String("`", lexed))
+        Error(_) -> #(
+          error(lexer, UnterminatedString),
+          token.UnterminatedString("`", lexed),
+        )
         Ok(#(character, source)) ->
           lex_template_head(advance(lexer, source), lexed <> character)
       }
···
     }
     "\\" <> source ->
       case string.pop_grapheme(source) {
-        Error(_) -> #(lexer, [token.TemplateTail(lexed), ..tokens])
+        Error(_) -> #(error(lexer, UnterminatedTemplate), [
+          token.UnterminatedTemplate(lexed),
+          ..tokens
+        ])
         Ok(#(character, source)) ->
           lex_template_parts(
             advance(lexer, source),
···
     }
     _ ->
       case string.pop_grapheme(lexer.source) {
-        Error(_) -> #(lexer, [token.TemplateTail(lexed), ..tokens])
+        Error(_) -> #(error(lexer, UnterminatedTemplate), [
+          token.UnterminatedTemplate(lexed),
+          ..tokens
+        ])
         Ok(#(character, source)) ->
           lex_template_parts(
             advance(lexer, source),
···
   }
 }
 
-fn lex_regex(lexer: Lexer, lexed: String, in_group: Bool) -> #(Lexer, String) {
+fn lex_regex(lexer: Lexer, lexed: String, in_group: Bool) -> #(Lexer, Token) {
   case lexer.source {
-    "/" <> source if !in_group -> #(advance(lexer, source), lexed)
+    "/" <> source if !in_group -> #(
+      advance(lexer, source),
+      token.RegularExpression(lexed),
+    )
     "[" <> source -> lex_regex(advance(lexer, source), lexed <> "[", True)
     "]" <> source -> lex_regex(advance(lexer, source), lexed <> "]", False)
+    "\n" <> _source
+    | "\r" <> _source
+    | "\u{2028}" <> _source
+    | "\u{2029}" <> _source -> #(
+      error(lexer, UnterminatedRegularExpression),
+      token.UnterminatedRegularExpression(lexed),
+    )
     "\\" <> source ->
       case string.pop_grapheme(source) {
-        Error(_) -> #(lexer, lexed)
+        Error(_) -> #(
+          error(lexer, UnterminatedRegularExpression),
+          token.UnterminatedRegularExpression(lexed),
+        )
         Ok(#(character, source)) ->
           lex_regex(
             advance(lexer, source),
···
       }
     _ ->
       case string.pop_grapheme(lexer.source) {
-        Error(_) -> #(lexer, lexed)
+        Error(_) -> #(
+          error(lexer, UnterminatedRegularExpression),
+          token.UnterminatedRegularExpression(lexed),
+        )
         Ok(#(character, source)) ->
           lex_regex(advance(lexer, source), lexed <> character, in_group)
       }
···
 fn advance(lexer: Lexer, source: String) -> Lexer {
   Lexer(..lexer, source:)
 }
+
+fn update_mode_with_token(lexer: Lexer, token: Token) -> Lexer {
+  let mode = case token {
+    // Comments and whitespace don't affect lexing mode
+    token.SingleLineComment(_)
+    | token.MultiLineComment(_)
+    | token.HashBangComment(_)
+    | token.Whitespace(_)
+    | token.LineTerminator(_)
+    | token.EndOfFile -> lexer.mode
+
+    // Values make us look for division
+    token.Identifier(_)
+    | token.PrivateIdentifier(_)
+    | token.Number(_)
+    | token.BigInt(_)
+    | token.String(..)
+    | token.RegularExpression(_)
+    | token.TemplateTail(_) -> TreatSlashAsDivision
+
+    // These keywords act as values, so we look for division after them
+    token.False | token.Null | token.This | token.True -> TreatSlashAsDivision
+
+    // After a grouping we look for division
+    token.RightParen | token.RightSquare -> TreatSlashAsDivision
+
+    // These can be either postfix or prefix. Either way, we keep the lexing mode the same.
+    token.DoublePlus | token.DoubleMinus -> lexer.mode
+
+    // In any other case, we look for a regular expression next.
+    _ -> TreatSlashAsRegex
+  }
+
+  Lexer(..lexer, mode:)
+}
+
+fn error(lexer: Lexer, error: Error) -> Lexer {
+  Lexer(..lexer, errors: [error, ..lexer.errors])
+}
+15 -1
src/just/highlight.gleam
···
   Operator(String)
   Comment(String)
   Punctuation(String)
+  Other(String)
 }
 
 /// Convert a string of JavaScript source code into ansi highlighting.
···
       Operator(s) -> ansi.magenta(s)
       Comment(s) -> ansi.italic(ansi.gray(s))
       Punctuation(s) -> ansi.reset(s)
+      Other(s) -> ansi.reset(s)
     }
   })
 }
···
         acc <> "<span class=hl-comment>" <> houdini.escape(s) <> "</span>"
       Punctuation(s) ->
         acc <> "<span class=hl-punctuation>" <> houdini.escape(s) <> "</span>"
+      Other(s) -> acc <> s
     }
   })
 }
···
 ///
 pub fn to_tokens(code: String) -> List(Token) {
   let lexer = just.new(code)
-  do_to_tokens(just.tokenise(lexer), [])
+  let #(tokens, _errors) = just.tokenise(lexer)
+  do_to_tokens(tokens, [])
 }
 
 fn do_to_tokens(in: List(t.Token), out: List(Token)) -> List(Token) {
···
     [t.DoubleAmpersandEqual, ..in] -> do_to_tokens(in, [Operator("&&="), ..out])
     [t.DoublePipeEqual, ..in] -> do_to_tokens(in, [Operator("||="), ..out])
     [t.DoubleQuestionEqual, ..in] -> do_to_tokens(in, [Operator("??="), ..out])
+
+    [t.Unknown(value), ..in] -> do_to_tokens(in, [Other(value), ..out])
+    [t.UnterminatedComment(value), ..in] ->
+      do_to_tokens(in, [Comment("/*" <> value), ..out])
+    [t.UnterminatedRegularExpression(value), ..in] ->
+      do_to_tokens(in, [Regexp("/" <> value), ..out])
+    [t.UnterminatedString(quote:, contents:), ..in] ->
+      do_to_tokens(in, [String(quote <> contents), ..out])
+    [t.UnterminatedTemplate(contents), ..in] ->
+      do_to_tokens(in, [String(contents), ..out])
   }
 }
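A quick sketch of what this means for the highlighter (the resulting token noted in the comment is my reading of the new branches above):

```gleam
import just/highlight

pub fn main() {
  // The input ends inside a template literal. Previously the lexer
  // returned a plain String token here; now it surfaces as an
  // unterminated token, which the highlighter still renders as string
  // content: roughly [String("`oops")].
  echo highlight.to_tokens("`oops")
}
```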
+13
src/just/token.gleam
···
   DoubleAmpersandEqual
   DoublePipeEqual
   DoubleQuestionEqual
+
+  // Invalid tokens
+  Unknown(String)
+  UnterminatedString(quote: String, contents: String)
+  UnterminatedTemplate(String)
+  UnterminatedRegularExpression(String)
+  UnterminatedComment(String)
 }
 
 pub type ContextualKeyword {
···
     DoubleAmpersandEqual -> "&&="
     DoublePipeEqual -> "||="
     DoubleQuestionEqual -> "??="
+
+    Unknown(value) -> value
+    UnterminatedComment(value) -> "/*" <> value
+    UnterminatedRegularExpression(value) -> "/" <> value
+    UnterminatedString(quote:, contents:) -> quote <> contents
+    UnterminatedTemplate(value) -> value
   }
 }
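Because every invalid variant keeps its original text, printing a token stream back out should reproduce the input exactly, errors and all. A sketch of that round-trip: `to_source` is my assumed name for the stringifier whose new cases appear in the hunk above (substitute the real name from token.gleam if it differs), and I'm assuming `EndOfFile` stringifies to the empty string:

```gleam
import gleam/list
import gleam/string
import just
import just/token

pub fn main() {
  let source = "let s = \"unterminated"
  let #(tokens, _errors) = just.tokenise(just.new(source))

  // Concatenating each token's source text should rebuild the input
  // byte-for-byte, since UnterminatedString et al. keep their raw text.
  let round_tripped =
    tokens
    |> list.map(token.to_source)
    |> string.concat

  echo round_tripped == source
  // -> True
}
```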