Social media crossposting tool — third time's the charm.
mastodon misskey crossposting bluesky
at next 127 lines 4.2 kB view raw
import re

from cross.tokens import LinkToken, MentionToken, TagToken, TextToken, Token
from util.html import HTMLToTokensParser
from util.splitter import canonical_label


URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")

REGEXES = [URL, MD_INLINE_LINK, MD_AUTOLINK, HASHTAG, FEDIVERSE_HANDLE]


# TODO autolinks are broken by the html parser
class MarkdownParser:
    """Convert markdown-flavoured HTML text into a flat list of Tokens.

    The input is first split by HTMLToTokensParser; every plain-text piece
    is then re-scanned for markdown constructs (inline links, autolinks,
    hashtags, fediverse mentions, bare URLs).
    """

    def parse(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Tokenize *text*.

        Args:
            text: markdown-ish source (may contain HTML handled upstream).
            tags: known hashtag names; presumably lowercased by the caller,
                since membership is tested against the lowercased tag —
                TODO confirm.
            handles: known mention pairs; the second element is used as the
                MentionToken username.

        Returns:
            The flattened token stream. Link tokens whose label is empty or
            canonical (per ``canonical_label``) pass through untouched; any
            other link is re-serialized to markdown and re-scanned so its
            label receives the same treatment.
        """
        if not text:
            return []

        html_parser = HTMLToTokensParser()
        html_parser.feed(text)

        result: list[Token] = []
        for token in html_parser.get_result():
            if isinstance(token, TextToken):
                result += self.__tokenize_md(token.text, tags, handles)
            elif isinstance(token, LinkToken) and token.label and not canonical_label(
                token.label, token.href
            ):
                # Non-canonical labelled link: round-trip through markdown so
                # the label itself gets scanned for nested constructs.
                result += self.__tokenize_md(
                    f"[{token.label}]({token.href})", tags, handles
                )
            else:
                result.append(token)

        return result

    def __tokenize_md(
        self, text: str, tags: list[str], handles: list[tuple[str, str]]
    ) -> list[Token]:
        """Single left-to-right scan of *text*.

        At each position the first matching construct wins, in this order:
        markdown inline link, autolink, hashtag, fediverse mention, bare URL.
        Any character that starts no construct accumulates into a pending
        TextToken. Hashtags are emitted only when their lowercased name is in
        *tags*; mentions only when the handle occurs in a *handles* pair —
        otherwise both fall through and become plain text.
        """
        out: list[Token] = []
        pending: list[str] = []
        pos = 0
        end = len(text)

        def emit_pending() -> None:
            # Close the current run of plain characters, if any.
            if pending:
                out.append(TextToken(text="".join(pending)))
                pending.clear()

        while pos < end:
            ch = text[pos]

            if ch == "[" and (m := MD_INLINE_LINK.match(text, pos)):
                emit_pending()
                out.append(LinkToken(href=m.group(2), label=m.group(1)))
                pos = m.end()
                continue

            if ch == "<" and (m := MD_AUTOLINK.match(text, pos)):
                emit_pending()
                out.append(LinkToken(href=m.group(1), label=None))
                pos = m.end()
                continue

            if ch == "#" and (m := HASHTAG.match(text, pos)):
                name = m.group(1)
                # Unknown tags are NOT consumed here: the '#' and the tag
                # text fall through to the plain-text buffer below.
                if name.lower() in tags:
                    emit_pending()
                    out.append(TagToken(tag=name))
                    pos = m.end()
                    continue

            if ch == "@" and (m := FEDIVERSE_HANDLE.match(text, pos)):
                candidate = m.group(0).strip()
                known = next((pair for pair in handles if candidate in pair), None)
                if known:
                    emit_pending()
                    # TODO: misskey doesn't provide a uri
                    out.append(MentionToken(username=known[1], uri=None))
                    pos = m.end()
                    continue

            if m := URL.match(text, pos):
                emit_pending()
                out.append(LinkToken(href=m.group(0), label=None))
                pos = m.end()
                continue

            pending.append(ch)
            pos += 1

        emit_pending()
        return out