Render original HTML text of posts bridged from the Fediverse or Wafrn #26

-356

1 changed file

Diff round #1

expand all

unified split

src

lib

strings

html-sanitizer.ts

-356

src/lib/strings/html-sanitizer.ts

/**
 * Allowlist-based HTML sanitizer for post content bridged from other networks
 * (rules modelled on Mastodon's sanitizer configuration).
 *
 * Untrusted HTML is parsed with `DOMParser`, every element is checked against
 * an element/attribute/protocol allowlist, and the cleaned markup is
 * serialized back to a string.
 */

/** Schemes accepted for embedded media (`src` of `iframe` / `source`). */
const HTTP_PROTOCOLS: readonly string[] = ['http', 'https']

/** Schemes accepted for the `href` of an anchor. */
const LINK_PROTOCOLS: readonly string[] = [
  'http',
  'https',
  'dat',
  'dweb',
  'ipfs',
  'ipns',
  'ssb',
  'gopher',
  'xmpp',
  'magnet',
  'gemini',
]

/**
 * Matches a leading URL scheme. Only the colon is required: schemes such as
 * `javascript:`, `data:`, `magnet:` and `xmpp:` are not followed by `//`.
 */
const PROTOCOL_REGEX = /^([a-z][a-z0-9.+-]*):/i

/** Elements that are always allowed. */
const BASE_ELEMENTS: readonly string[] = [
  'p',
  'br',
  'span',
  'a',
  'del',
  's',
  'pre',
  'blockquote',
  'code',
  'b',
  'strong',
  'u',
  'i',
  'em',
  'ul',
  'ol',
  'li',
  'ruby',
  'rt',
  'rp',
]

/** Extra elements allowed only when `allowOembed` is set. */
const OEMBED_ELEMENTS: readonly string[] = ['audio', 'iframe', 'source', 'video']

const ALLOWED_ELEMENTS: ReadonlySet<string> = new Set(BASE_ELEMENTS)
const ALLOWED_ELEMENTS_WITH_OEMBED: ReadonlySet<string> = new Set([
  ...BASE_ELEMENTS,
  ...OEMBED_ELEMENTS,
])

/** Headings are not rendered as such; they become `<p><strong>…</strong></p>`. */
const HEADING_ELEMENTS: ReadonlySet<string> = new Set([
  'h1',
  'h2',
  'h3',
  'h4',
  'h5',
  'h6',
])

/** Elements whose text content is code, not prose, and must not be shown. */
const DROPPED_ELEMENTS: ReadonlySet<string> = new Set(['script', 'style'])

/** Attributes that are always allowed, keyed by lower-case tag name. */
const BASE_ATTRIBUTES: ReadonlyMap<string, readonly string[]> = new Map([
  ['a', ['href', 'rel', 'class', 'translate']],
  ['span', ['class', 'translate']],
  ['ol', ['start', 'reversed']],
  ['li', ['value']],
  ['p', ['class']],
])

/** Extra attributes allowed only when `allowOembed` is set. */
const OEMBED_ATTRIBUTES: ReadonlyMap<string, readonly string[]> = new Map([
  ['audio', ['controls']],
  [
    'iframe',
    ['allowfullscreen', 'frameborder', 'height', 'scrolling', 'src', 'width'],
  ],
  ['source', ['src', 'type']],
  ['video', ['controls', 'height', 'loop', 'width']],
])

/** Sandbox tokens applied to every retained iframe. */
const IFRAME_SANDBOX =
  'allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox allow-forms'

interface SanitizeOptions {
  /** Also allow `audio`, `iframe`, `source` and `video` elements. */
  allowOembed?: boolean
}

/**
 * Sanitizes untrusted HTML and returns markup limited to the allowlist.
 *
 * @param html - Untrusted HTML source.
 * @param options - Optional switches; see {@link SanitizeOptions}.
 * @returns Sanitized HTML. When `DOMParser` is not available, all tags are
 *   stripped and any leftover angle brackets are escaped instead.
 */
export function sanitizeHtml(
  html: string,
  options: SanitizeOptions = {},
): string {
  if (typeof DOMParser === 'undefined') {
    // No DOM available: fall back to a tag-stripping pass.
    return sanitizeTextOnly(html)
  }

  const doc = new DOMParser().parseFromString(html, 'text/html')
  sanitizeNode(doc.body, options)
  return doc.body.innerHTML
}

/**
 * Sanitizes all descendants of `node` in place.
 */
function sanitizeNode(node: Node, options: SanitizeOptions): void {
  const allowedElements = options.allowOembed
    ? ALLOWED_ELEMENTS_WITH_OEMBED
    : ALLOWED_ELEMENTS

  // Iterate over a snapshot: children are replaced/removed while walking.
  for (const child of Array.from(node.childNodes)) {
    if (child.nodeType === Node.TEXT_NODE) continue

    if (child.nodeType !== Node.ELEMENT_NODE) {
      // Comments and other non-content nodes carry nothing worth rendering.
      child.remove()
      continue
    }

    const element = child as Element
    const tagName = element.tagName.toLowerCase()
    const doc = element.ownerDocument

    // Headings: keep the content, rendered as bold text in its own paragraph.
    if (HEADING_ELEMENTS.has(tagName)) {
      const strong = doc.createElement('strong')
      while (element.firstChild) {
        strong.appendChild(element.firstChild)
      }
      const paragraph = doc.createElement('p')
      paragraph.appendChild(strong)
      element.replaceWith(paragraph)
      sanitizeNode(strong, options)
      continue
    }

    // MathML: replace with its textual annotation, or drop it entirely.
    if (tagName === 'math') {
      const mathText = extractMathAnnotation(element)
      if (mathText) {
        element.replaceWith(doc.createTextNode(mathText))
      } else {
        element.remove()
      }
      continue
    }

    // Script/style bodies are source code; never surface them as text.
    if (DROPPED_ELEMENTS.has(tagName)) {
      element.remove()
      continue
    }

    // Anything else outside the allowlist is flattened to its text.
    if (!allowedElements.has(tagName)) {
      element.replaceWith(doc.createTextNode(element.textContent ?? ''))
      continue
    }

    // Every allowed element - including <li> - has its attributes filtered
    // before its children are visited.
    if (!sanitizeAttributes(element, options)) {
      // The element was replaced by plain text; nothing left to recurse into.
      continue
    }

    sanitizeNode(element, options)
  }
}

/**
 * Removes every attribute not allowed for the element's tag, then applies the
 * tag-specific rules (anchor hardening, class allowlist, `translate`,
 * embed protocol checks).
 *
 * @returns `false` when the element was replaced in the document (an anchor
 *   with a rejected scheme), `true` when it is still in place.
 */
function sanitizeAttributes(
  element: Element,
  options: SanitizeOptions,
): boolean {
  const tagName = element.tagName.toLowerCase()
  const allowed = [
    ...(BASE_ATTRIBUTES.get(tagName) ?? []),
    ...(options.allowOembed ? (OEMBED_ATTRIBUTES.get(tagName) ?? []) : []),
  ]

  for (const attr of Array.from(element.attributes)) {
    if (!allowed.includes(attr.name.toLowerCase())) {
      element.removeAttribute(attr.name)
    }
  }

  if (tagName === 'a' && !processAnchorElement(element)) {
    return false
  }

  if (element.hasAttribute('class')) {
    processClassWhitelist(element)
  }

  // `translate` is only meaningful here as an opt-out.
  if (
    element.hasAttribute('translate') &&
    element.getAttribute('translate') !== 'no'
  ) {
    element.removeAttribute('translate')
  }

  validateProtocols(element, options)
  return true
}

/**
 * Hardens an anchor: links whose scheme is not in `LINK_PROTOCOLS` are
 * replaced by their text; all others get `rel` and `target` forced.
 *
 * @returns `false` when the anchor was replaced by a text node.
 */
function processAnchorElement(element: Element): boolean {
  const href = element.getAttribute('href')
  if (href) {
    const scheme = getScheme(href)
    if (
      scheme !== null &&
      scheme !== 'relative' &&
      !LINK_PROTOCOLS.includes(scheme)
    ) {
      element.replaceWith(
        element.ownerDocument.createTextNode(element.textContent ?? ''),
      )
      return false
    }
  }

  element.setAttribute('rel', 'nofollow noopener')
  element.setAttribute('target', '_blank')
  return true
}

/**
 * Keeps only known class names (microformats prefixes, mention/hashtag,
 * ellipsis/invisible, quote-inline); removes the attribute when none remain.
 */
function processClassWhitelist(element: Element): void {
  const kept = (element.getAttribute('class') ?? '')
    .split(/[\t\n\f\r ]+/)
    .filter(
      className =>
        /^[hpuedt]-/.test(className) ||
        /^(mention|hashtag)$/.test(className) ||
        /^(ellipsis|invisible)$/.test(className) ||
        className === 'quote-inline',
    )

  if (kept.length > 0) {
    element.setAttribute('class', kept.join(' '))
  } else {
    element.removeAttribute('class')
  }
}

/**
 * Embed-specific checks, active only with `allowOembed`: drops a `src` whose
 * scheme is detected and is not http(s), and sandboxes every iframe.
 */
function validateProtocols(element: Element, options: SanitizeOptions): void {
  if (!options.allowOembed) return

  const tagName = element.tagName.toLowerCase()
  if (tagName !== 'iframe' && tagName !== 'source') return

  const src = element.getAttribute('src')
  if (src) {
    const scheme = getScheme(src)
    if (scheme !== null && !HTTP_PROTOCOLS.includes(scheme)) {
      element.removeAttribute('src')
    }
  }

  // Applied whether or not a src survived, so no iframe is left unsandboxed.
  if (tagName === 'iframe') {
    element.setAttribute('sandbox', IFRAME_SANDBOX)
  }
}

/**
 * Returns the lower-cased scheme of `url`, `'relative'` for URLs starting
 * with `/` or `.`, or `null` when no scheme is present.
 *
 * ASCII tab/newline characters and leading control characters or spaces are
 * discarded first, because browsers ignore them when resolving a URL
 * (e.g. `" java\tscript:"` is treated as `javascript:`).
 */
function getScheme(url: string): string | null {
  const normalized = url
    .replace(/[\t\n\r]/g, '')
    .replace(/^[\u0000-\u0020]+/, '')

  const scheme = PROTOCOL_REGEX.exec(normalized)?.[1]
  if (scheme !== undefined) {
    return scheme.toLowerCase()
  }

  if (normalized.startsWith('/') || normalized.startsWith('.')) {
    return 'relative'
  }
  return null
}

/**
 * Extracts a textual annotation from a MathML `<math>` element.
 *
 * Looks inside a direct `<semantics>` child for an `application/x-tex`
 * annotation (returned wrapped in `$…$`, or `$$…$$` for `display="block"`),
 * then for a `text/plain` annotation. Returns `null` when neither exists.
 */
function extractMathAnnotation(mathElement: Element): string | null {
  const semantics = Array.from(mathElement.children).find(
    child => child.tagName.toLowerCase() === 'semantics',
  )
  if (!semantics) return null

  const findAnnotation = (encoding: string): Element | undefined =>
    Array.from(semantics.children).find(
      child =>
        child.tagName.toLowerCase() === 'annotation' &&
        child.getAttribute('encoding') === encoding,
    )

  const latex = findAnnotation('application/x-tex')
  if (latex) {
    const delimiter =
      mathElement.getAttribute('display') === 'block' ? '$$' : '$'
    return delimiter + (latex.textContent ?? '') + delimiter
  }

  const plain = findAnnotation('text/plain')
  if (plain) {
    return plain.textContent || null
  }

  return null
}

/**
 * Fallback sanitizer for environments without `DOMParser`.
 *
 * Strips every complete tag, then escapes any angle bracket that is left
 * (e.g. from an unterminated `<img …`), so the result contains no markup.
 */
function sanitizeTextOnly(html: string): string {
  return html
    .replace(/<[^>]*>/g, '')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
}

History

2 rounds 5 comments

maxine.puppykitty.racing submitted #1 3mo

interdiff

5 commits

expand

e7e78fad

fix: don't duplicate work in MastodonHtmlContent

3e5262ab

chore: remove any casts

265f3ab4

chore: replace unicode ellipsis with escaped version

eff00beb

feat/MastodonHtml: render as ordered lists (with numeric prefixes)

a28c6d3f

feat/MastodonHtml: collapse posts taller than 150px

expand 5 comments

daniela.lol 3mo

i am like 99% sure this would be considered a license violation if merged as mastodon is licensed under AGPL while witchsky is MIT

maxine.puppykitty.racing 3mo

Good point, I will rewrite the sanitizer from scratch

ewancroft.uk 7w

Hey Maxine! Did you get this done? I’d like to see if we can merge it once the conflicts are resolved.

maxine.puppykitty.racing 6w

Sorry ewan, haven't had the time, also this PR has some weird bugs (sometimes the render crashes and I never diagnosed it), you might want to close this one for the meanwhile

lemmaeof.gay 6w

I might look into writing a non-vibe-coded version of this at some point, it'd be a fun way to cut my teeth on webdev again

closed without merging

maxine.puppykitty.racing submitted #0 3mo

diff

2 commits

expand

6e85dcd3

feat: render full post contents for posts bridged from mastodon or wafrn

e7e78fad

fix: don't duplicate work in MastodonHtmlContent

expand 0 comments