social media crossposting tool. 3rd time's the charm
mastodon misskey crossposting bluesky
at next 151 lines 4.8 kB view raw
1from html.parser import HTMLParser 2from typing import override 3 4from cross.tokens import LinkToken, TextToken, Token 5from util.splitter import canonical_label 6 7 8class HTMLToTokensParser(HTMLParser): 9 def __init__(self) -> None: 10 super().__init__() 11 self.tokens: list[Token] = [] 12 13 self._tag_stack: dict[str, tuple[str, dict[str, str | None]]] = {} 14 self.in_pre: bool = False 15 self.in_code: bool = False 16 self.invisible: bool = False 17 18 def handle_a_endtag(self): 19 label, _attr = self._tag_stack.pop("a") 20 21 href = _attr.get("href") 22 if href: 23 if canonical_label(label, href): 24 self.tokens.append(LinkToken(href=href)) 25 else: 26 self.tokens.append(LinkToken(href=href, label=label)) 27 28 def append_text(self, text: str): 29 self.tokens.append(TextToken(text=text)) 30 31 def append_newline(self): 32 if self.tokens: 33 last_token = self.tokens[-1] 34 if isinstance(last_token, TextToken) and not last_token.text.endswith("\n"): 35 self.tokens.append(TextToken(text="\n")) 36 37 @override 38 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: 39 _attr = dict(attrs) 40 41 if self.invisible: 42 return 43 44 match tag: 45 case "p": 46 cls = _attr.get("class", "") 47 if cls and "quote-inline" in cls: 48 self.invisible = True 49 case "a": 50 self._tag_stack["a"] = ("", _attr) 51 case "code": 52 if not self.in_pre: 53 self.append_text("`") 54 self.in_code = True 55 case "pre": 56 self.append_newline() 57 self.append_text("```\n") 58 self.in_pre = True 59 case "blockquote": 60 self.append_newline() 61 self.append_text("> ") 62 case "strong" | "b": 63 self.append_text("**") 64 case "em" | "i": 65 self.append_text("*") 66 case "del" | "s": 67 self.append_text("~~") 68 case "br": 69 self.append_text("\n") 70 case "h1" | "h2" | "h3" | "h4" | "h5" | "h6": 71 level = int(tag[1]) 72 self.append_text("\n" + "#" * level + " ") 73 case _: 74 # self.builder.extend(f"<{tag}>".encode("utf-8")) 75 pass 76 77 @override 78 def handle_endtag(self, tag: str) -> None: 79 if self.invisible: 80 if tag == "p": 81 self.invisible = False 82 return 83 84 match tag: 85 case "a": 86 if "a" in self._tag_stack: 87 self.handle_a_endtag() 88 case "code": 89 if not self.in_pre and self.in_code: 90 self.append_text("`") 91 self.in_code = False 92 case "pre": 93 self.append_newline() 94 self.append_text("```\n") 95 self.in_pre = False 96 case "blockquote": 97 self.append_text("\n") 98 case "strong" | "b": 99 self.append_text("**") 100 case "em" | "i": 101 self.append_text("*") 102 case "del" | "s": 103 self.append_text("~~") 104 case "p": 105 self.append_text("\n\n") 106 case "h1" | "h2" | "h3" | "h4" | "h5" | "h6": 107 self.append_text("\n") 108 case _: 109 # self.builder.extend(f"</{tag}>".encode("utf-8")) 110 pass 111 112 @override 113 def handle_data(self, data: str) -> None: 114 if self.invisible: 115 return 116 117 if self._tag_stack.get("a"): 118 label, _attr = self._tag_stack.pop("a") 119 self._tag_stack["a"] = (label + data, _attr) 120 else: 121 self.append_text(data) 122 123 def get_result(self) -> list[Token]: 124 if not self.tokens: 125 return [] 126 127 combined: list[Token] = [] 128 buffer: list[str] = [] 129 130 def flush_buffer(): 131 if buffer: 132 merged = "".join(buffer) 133 combined.append(TextToken(text=merged)) 134 buffer.clear() 135 136 for token in self.tokens: 137 if isinstance(token, TextToken): 138 buffer.append(token.text) 139 else: 140 flush_buffer() 141 combined.append(token) 142 143 flush_buffer() 144 145 if combined and isinstance(combined[-1], TextToken): 146 if combined[-1].text.endswith("\n\n"): 147 combined[-1] = TextToken(text=combined[-1].text[:-2]) 148 149 if combined[-1].text.endswith("\n"): 150 combined[-1] = TextToken(text=combined[-1].text[:-1]) 151 return combined