Social media crossposting tool. Third time's the charm.
mastodon
misskey
crossposting
bluesky
1import re
2
3from cross.tokens import LinkToken, MentionToken, TagToken, TextToken, Token
4from util.html import HTMLToTokensParser
5from util.splitter import canonical_label
6
7
# Bare URL: any scheme://... or mailto:..., greedy up to the next whitespace.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
# Markdown inline link: [label](url); the URL must carry a scheme or mailto:.
# Group 1 is the label, group 2 the href.
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)
# Markdown autolink: <url>, again requiring a scheme or mailto:. Group 1 is the href.
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
# Hashtag: '#' followed by word chars, not preceded by a word char (rejects foo#bar).
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
# Fediverse handle: @user or @user@host.tld, not preceded by a word char or '@'.
# Group 1 is the username, group 2 the (optional) domain.
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")

# All recognized inline patterns, in matching-priority order.
REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]
20
21
22# TODO autolinks are broken by the html parser
class MarkdownParser:
    """Tokenize markdown-ish text into the cross-posting token stream.

    The text is first run through ``HTMLToTokensParser``; plain-text pieces
    are then scanned for inline markdown constructs: ``[label](url)`` links,
    ``<url>`` autolinks, bare URLs, ``#hashtags`` (only those listed in
    *tags*) and fediverse ``@user@domain`` handles (only those listed in
    *handles*).
    """

    def parse(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Parse *text* into tokens.

        tags: hashtag names to promote to TagTokens; compared against the
            lowercased matched tag, so entries are presumably lowercase —
            TODO confirm against callers.
        handles: pairs of strings; a mention is emitted only when the
            matched handle text equals either element of some pair, and the
            pair's second element is used as the username.
        Returns an empty list for empty/None-ish input.
        """
        if not text:
            return []

        # First pass: let the HTML parser split out the markup it understands.
        tokenizer = HTMLToTokensParser()
        tokenizer.feed(text)
        html_tokens = tokenizer.get_result()

        tokens: list[Token] = []

        for tk in html_tokens:
            if isinstance(tk, TextToken):
                # Plain text may still contain markdown syntax.
                tokens.extend(self.__tokenize_md(tk.text, tags, handles))
            elif isinstance(tk, LinkToken):
                # Links with no label, or a canonical label (label is just
                # the URL itself), pass through untouched. Otherwise the
                # label may contain tags/mentions, so re-serialize to
                # markdown and re-tokenize it.
                if not tk.label or canonical_label(tk.label, tk.href):
                    tokens.append(tk)
                    continue

                tokens.extend(
                    self.__tokenize_md(f"[{tk.label}]({tk.href})", tags, handles)
                )
            else:
                tokens.append(tk)

        return tokens

    def __tokenize_md(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Scan *text* left to right, emitting Link/Tag/Mention tokens for
        recognized constructs and coalescing everything else into TextTokens.
        """
        index: int = 0
        total: int = len(text)
        buffer: list[str] = []

        tokens: list[Token] = []

        # Hoisted lookups: the loop below tests membership once per '#'/'@'
        # candidate, so O(1) structures beat rescanning the lists each time.
        tag_set = frozenset(tags)
        # Map each element of every (a, b) pair back to its pair; setdefault
        # preserves the original first-match-wins ordering across pairs.
        handle_map: dict[str, tuple[str, str]] = {}
        for pair in handles:
            for part in pair:
                handle_map.setdefault(part, pair)

        def flush():
            # Emit any accumulated plain text as a single TextToken.
            nonlocal buffer
            if buffer:
                tokens.append(TextToken(text="".join(buffer)))
                buffer = []

        while index < total:
            if text[index] == "[":
                md_inline = MD_INLINE_LINK.match(text, index)
                if md_inline:
                    flush()
                    tokens.append(
                        LinkToken(href=md_inline.group(2), label=md_inline.group(1))
                    )
                    index = md_inline.end()
                    continue

            if text[index] == "<":
                md_auto = MD_AUTOLINK.match(text, index)
                if md_auto:
                    flush()
                    tokens.append(LinkToken(href=md_auto.group(1), label=None))
                    index = md_auto.end()
                    continue

            if text[index] == "#":
                tag = HASHTAG.match(text, index)
                if tag:
                    tag_text = tag.group(1)
                    # Only known tags become TagTokens; unknown ones stay text.
                    if tag_text.lower() in tag_set:
                        flush()
                        tokens.append(TagToken(tag=tag_text))
                        index = tag.end()
                        continue

            if text[index] == "@":
                handle = FEDIVERSE_HANDLE.match(text, index)
                if handle:
                    # No strip() needed: the handle regex's character classes
                    # cannot match whitespace, so group(0) is already clean.
                    match = handle_map.get(handle.group(0))
                    if match:
                        flush()
                        tokens.append(
                            MentionToken(username=match[1], uri=None)
                        )  # TODO: misskey doesn't provide a uri
                        index = handle.end()
                        continue

            url = URL.match(text, index)
            if url:
                flush()
                tokens.append(LinkToken(href=url.group(0), label=None))
                index = url.end()
                continue

            buffer.append(text[index])
            index += 1

        flush()
        return tokens