Social media crossposting tool. Third time's the charm.
mastodon
misskey
crossposting
bluesky
1import re
2
3from cross.tokens import LinkToken, MentionToken, TagToken, TextToken, Token
4from util.html import HTMLToTokensParser
5from util.splitter import canonical_label
6
7
# Bare URL: any scheme://... or mailto:..., greedy up to the next whitespace.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
# Markdown inline link: [label](url); the URL must carry a scheme or mailto:.
# Group 1 is the label, group 2 the href.
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)
# Markdown autolink: <url>, again requiring a scheme or mailto:. Group 1 is the href.
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
# Hashtag: '#' followed by word chars, not preceded by a word char (rejects foo#bar).
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
# Fediverse handle: @user or @user@host.tld, not preceded by a word char or '@'.
# Group 1 is the username, group 2 the (optional) domain.
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")

# All recognized inline patterns, in matching-priority order.
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
20
21
22# TODO autolinks are broken by the html parser
class MarkdownParser:
    """Tokenize markdown-ish text into the cross-posting token stream.

    The text is first run through ``HTMLToTokensParser``; plain-text pieces
    are then scanned for inline markdown constructs: ``[label](url)`` links,
    ``<url>`` autolinks, bare URLs, ``#hashtags`` (only those listed in
    *tags*) and fediverse ``@user@domain`` handles (only those listed in
    *handles*).
    """

    def parse(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Parse *text* into tokens.

        tags: hashtag names to promote to TagTokens; compared against the
            lowercased matched tag, so entries are presumably lowercase —
            TODO confirm against callers.
        handles: pairs of strings; a mention is emitted only when the
            matched handle text equals either element of some pair, and the
            pair's second element is used as the username.
        Returns an empty list for empty/None-ish input.
        """
        if not text:
            return []

        # First pass: let the HTML parser split out the markup it understands.
        tokenizer = HTMLToTokensParser()
        tokenizer.feed(text)
        html_tokens = tokenizer.get_result()

        tokens: list[Token] = []

        for tk in html_tokens:
            if isinstance(tk, TextToken):
                # Plain text may still contain markdown syntax.
                tokens.extend(self.__tokenize_md(tk.text, tags, handles))
            elif isinstance(tk, LinkToken):
                # Links with no label, or a canonical label (label is just
                # the URL itself), pass through untouched. Otherwise the
                # label may contain tags/mentions, so re-serialize to
                # markdown and re-tokenize it.
                if not tk.label or canonical_label(tk.label, tk.href):
                    tokens.append(tk)
                    continue

                tokens.extend(
                    self.__tokenize_md(f"[{tk.label}]({tk.href})", tags, handles)
                )
            else:
                tokens.append(tk)

        return tokens

    def __tokenize_md(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Scan *text* left to right, emitting Link/Tag/Mention tokens for
        recognized constructs and coalescing everything else into TextTokens.
        """
        index: int = 0
        total: int = len(text)
        buffer: list[str] = []

        tokens: list[Token] = []

        # Hoisted lookups: the loop below tests membership once per '#'/'@'
        # candidate, so O(1) structures beat rescanning the lists each time.
        tag_set = frozenset(tags)
        # Map each element of every (a, b) pair back to its pair; setdefault
        # preserves the original first-match-wins ordering across pairs.
        handle_map: dict[str, tuple[str, str]] = {}
        for pair in handles:
            for part in pair:
                handle_map.setdefault(part, pair)

        def flush():
            # Emit any accumulated plain text as a single TextToken.
            nonlocal buffer
            if buffer:
                tokens.append(TextToken(text="".join(buffer)))
                buffer = []

        while index < total:
            if text[index] == "[":
                md_inline = MD_INLINE_LINK.match(text, index)
                if md_inline:
                    flush()
                    tokens.append(
                        LinkToken(href=md_inline.group(2), label=md_inline.group(1))
                    )
                    index = md_inline.end()
                    continue

            if text[index] == "<":
                md_auto = MD_AUTOLINK.match(text, index)
                if md_auto:
                    flush()
                    tokens.append(LinkToken(href=md_auto.group(1), label=None))
                    index = md_auto.end()
                    continue

            if text[index] == "#":
                tag = HASHTAG.match(text, index)
                if tag:
                    tag_text = tag.group(1)
                    # Only known tags become TagTokens; unknown ones stay text.
                    if tag_text.lower() in tag_set:
                        flush()
                        tokens.append(TagToken(tag=tag_text))
                        index = tag.end()
                        continue

            if text[index] == "@":
                handle = FEDIVERSE_HANDLE.match(text, index)
                if handle:
                    # No strip() needed: the handle regex's character classes
                    # cannot match whitespace, so group(0) is already clean.
                    match = handle_map.get(handle.group(0))
                    if match:
                        flush()
                        tokens.append(
                            MentionToken(username=match[1], uri=None)
                        )  # TODO: misskey doesn't provide a uri
                        index = handle.end()
                        continue

            url = URL.match(text, index)
            if url:
                flush()
                tokens.append(LinkToken(href=url.group(0), label=None))
                index = url.end()
                continue

            buffer.append(text[index])
            index += 1

        flush()
        return tokens