Barazo AppView backend barazo.forum

feat(firehose): sanitize user-generated content at indexing time (#62)

* feat(firehose): sanitize user-generated content at indexing time

Content from PDS records was stored unsanitized in PostgreSQL. Add
DOMPurify sanitization in topic and reply indexers so all stored
content is clean regardless of which client reads it.

- sanitizeHtml: allows safe markdown tags, strips scripts/iframes/forms
- sanitizeText: strips all HTML (used for topic titles)
- Both apply Unicode NFC normalization and strip bidi override chars
- 32 new tests (26 sanitize module + 6 indexer integration)

* fix(ci): remove lockfile that should not be tracked

This repo intentionally has no lockfile (removed in 1dd1cbc). The
workspace root manages dependencies. CI generates a fresh lockfile.

authored by

Guido X Jansen and committed by
GitHub
c33161eb 545b5504

+498 -8
+1
package.json
··· 48 48 "drizzle-orm": "^0.45.1", 49 49 "fastify": "^5.7.4", 50 50 "ioredis": "^5.6.1", 51 + "isomorphic-dompurify": "^2.36.0", 51 52 "multiformats": "^13.4.2", 52 53 "postgres": "^3.4.8", 53 54 "sharp": "^0.34.5",
+3 -2
src/firehose/indexers/reply.ts
··· 5 5 import type { Logger } from '../../lib/logger.js' 6 6 import type { TrustStatus } from '../../services/account-age.js' 7 7 import { clampCreatedAt } from '../clamp-timestamp.js' 8 + import { sanitizeHtml } from '../../lib/sanitize.js' 8 9 9 10 interface CreateParams { 10 11 uri: string ··· 54 55 uri, 55 56 rkey, 56 57 authorDid: did, 57 - content: record['content'] as string, 58 + content: sanitizeHtml(record['content'] as string), 58 59 contentFormat: (record['contentFormat'] as string | undefined) ?? null, 59 60 rootUri: root.uri, 60 61 rootCid: root.cid, ··· 87 88 await this.db 88 89 .update(replies) 89 90 .set({ 90 - content: record['content'] as string, 91 + content: sanitizeHtml(record['content'] as string), 91 92 contentFormat: (record['contentFormat'] as string | undefined) ?? null, 92 93 cid, 93 94 labels: (record['labels'] as { values: { val: string }[] } | undefined) ?? null,
+7 -6
src/firehose/indexers/topic.ts
··· 4 4 import type { Logger } from '../../lib/logger.js' 5 5 import type { TrustStatus } from '../../services/account-age.js' 6 6 import { clampCreatedAt } from '../clamp-timestamp.js' 7 + import { sanitizeHtml, sanitizeText } from '../../lib/sanitize.js' 7 8 8 9 interface CreateParams { 9 10 uri: string ··· 38 39 uri, 39 40 rkey, 40 41 authorDid: did, 41 - title: record['title'] as string, 42 - content: record['content'] as string, 42 + title: sanitizeText(record['title'] as string), 43 + content: sanitizeHtml(record['content'] as string), 43 44 contentFormat: (record['contentFormat'] as string | undefined) ?? null, 44 45 category: record['category'] as string, 45 46 tags: (record['tags'] as string[] | undefined) ?? null, ··· 53 54 .onConflictDoUpdate({ 54 55 target: topics.uri, 55 56 set: { 56 - title: record['title'] as string, 57 - content: record['content'] as string, 57 + title: sanitizeText(record['title'] as string), 58 + content: sanitizeHtml(record['content'] as string), 58 59 contentFormat: (record['contentFormat'] as string | undefined) ?? null, 59 60 category: record['category'] as string, 60 61 tags: (record['tags'] as string[] | undefined) ?? null, ··· 73 74 await this.db 74 75 .update(topics) 75 76 .set({ 76 - title: record['title'] as string, 77 - content: record['content'] as string, 77 + title: sanitizeText(record['title'] as string), 78 + content: sanitizeHtml(record['content'] as string), 78 79 contentFormat: (record['contentFormat'] as string | undefined) ?? null, 79 80 category: record['category'] as string, 80 81 tags: (record['tags'] as string[] | undefined) ?? null,
+83
src/lib/sanitize.ts
import DOMPurify from 'isomorphic-dompurify'

/**
 * Bidirectional override and mark characters to strip from all text.
 * Prevents text reordering attacks (bidi override) and invisible direction marks.
 */
const BIDI_REGEX = /[\u202A-\u202E\u2066-\u2069\u200E\u200F]/g

/** Tags allowed in forum content (markdown-rendered HTML). */
const ALLOWED_TAGS = [
  'p',
  'br',
  'strong',
  'em',
  'a',
  'code',
  'pre',
  'blockquote',
  'ul',
  'ol',
  'li',
  'h1',
  'h2',
  'h3',
  'h4',
  'h5',
  'h6',
  'hr',
  'img',
  'table',
  'thead',
  'tbody',
  'tr',
  'th',
  'td',
  'del',
  'sup',
  'sub',
  'span',
]

/** Attributes allowed on permitted tags. */
const ALLOWED_ATTR = ['href', 'src', 'alt', 'title', 'class', 'rel', 'target']

/**
 * Upper bound on normalize -> sanitize rounds. Ordinary content is stable after
 * the first round and entity-encoded input after the second; the bound only
 * guarantees termination on pathological input.
 */
const MAX_SANITIZE_PASSES = 4

/**
 * Apply Unicode NFC normalization and strip bidirectional override characters.
 */
function normalizeText(input: string): string {
  return input.normalize('NFC').replace(BIDI_REGEX, '')
}

/**
 * Run `purify` on normalized input and repeat until the sanitized output is
 * itself unchanged by normalization.
 *
 * Why more than one pass: the sanitizer decodes character references while
 * parsing, so input such as `&#x202E;` (or an entity-encoded combining
 * sequence) is invisible to the up-front `normalizeText` call but comes out of
 * the sanitizer as a raw character. Stripping that character from the output
 * without sanitizing again would be unsafe, because removing a character can
 * join fragments the sanitizer judged harmless (e.g. `java&#x202E;script:` in
 * an `href`). Every value returned from here is therefore a direct result of
 * `purify`.
 *
 * @param input - Raw, untrusted text; must be non-empty.
 * @param purify - Sanitizer to apply to each normalized candidate.
 * @returns The sanitized string. If the pass limit is hit before the output
 *   stabilizes, the last sanitized result is returned as-is.
 */
function sanitizeUntilStable(input: string, purify: (html: string) => string): string {
  let candidate = input
  let clean = ''

  for (let pass = 0; pass < MAX_SANITIZE_PASSES; pass++) {
    clean = purify(normalizeText(candidate))
    // Stable: nothing left for normalization to change, so we are done.
    if (normalizeText(clean) === clean) return clean
    candidate = clean
  }

  return clean
}

/**
 * Sanitize HTML content for storage. Allows safe markdown-rendered tags.
 * Applies NFC normalization and strips bidi override characters, including
 * ones supplied as HTML character references.
 *
 * Use for topic content and reply content fields.
 *
 * @param input - Untrusted HTML from a record.
 * @returns Sanitized HTML containing only `ALLOWED_TAGS` / `ALLOWED_ATTR`;
 *   `''` for empty input.
 */
export function sanitizeHtml(input: string): string {
  if (input === '') return ''

  return sanitizeUntilStable(input, (html) =>
    DOMPurify.sanitize(html, {
      ALLOWED_TAGS,
      ALLOWED_ATTR,
      ALLOW_DATA_ATTR: false,
    })
  )
}

/**
 * Sanitize plain text (strip all HTML). Used for topic titles.
 * Applies NFC normalization and strips bidi override characters, including
 * ones supplied as HTML character references.
 *
 * NOTE(review): the result is DOMPurify's serialized output, so characters
 * such as `&` presumably come back entity-escaped (`&amp;`) — confirm that
 * readers of the title field expect that.
 *
 * @param input - Untrusted text from a record.
 * @returns The text with all tags and attributes removed; `''` for empty input.
 */
export function sanitizeText(input: string): string {
  if (input === '') return ''

  return sanitizeUntilStable(input, (html) =>
    DOMPurify.sanitize(html, {
      ALLOWED_TAGS: [],
      ALLOWED_ATTR: [],
    })
  )
}
+102
tests/unit/firehose/indexers/reply.test.ts
··· 99 99 }) 100 100 }) 101 101 102 + describe('sanitization', () => { 103 + it('sanitizes content (strips scripts) on create', async () => { 104 + const insertValuesMock = vi.fn().mockReturnValue({ 105 + onConflictDoNothing: vi.fn().mockResolvedValue(undefined), 106 + }) 107 + const mockTx = { 108 + insert: vi.fn().mockReturnValue({ values: insertValuesMock }), 109 + update: vi.fn().mockReturnValue({ 110 + set: vi.fn().mockReturnValue({ 111 + where: vi.fn().mockResolvedValue(undefined), 112 + }), 113 + }), 114 + } 115 + const db = { 116 + ...createMockDb(), 117 + transaction: vi 118 + .fn() 119 + .mockImplementation(async (fn: (tx: typeof mockTx) => Promise<void>) => fn(mockTx)), 120 + } 121 + const logger = createMockLogger() 122 + const indexer = new ReplyIndexer(db as never, logger as never) 123 + 124 + await indexer.handleCreate({ 125 + ...baseParams, 126 + record: { 127 + content: '<p>Reply</p><script>evil()</script>', 128 + root: { uri: 'at://did:plc:test/forum.barazo.topic.post/topic1', cid: 'bafytopic' }, 129 + parent: { uri: 'at://did:plc:test/forum.barazo.topic.post/topic1', cid: 'bafytopic' }, 130 + community: 'did:plc:community', 131 + createdAt: '2026-01-01T00:00:00.000Z', 132 + }, 133 + }) 134 + 135 + const values = insertValuesMock.mock.calls[0][0] as Record<string, unknown> 136 + expect(values.content).toContain('<p>Reply</p>') 137 + expect(values.content).not.toContain('<script>') 138 + }) 139 + 140 + it('sanitizes content on update', async () => { 141 + const setMock = vi.fn().mockReturnValue({ 142 + where: vi.fn().mockResolvedValue(undefined), 143 + }) 144 + const db = { 145 + ...createMockDb(), 146 + update: vi.fn().mockReturnValue({ set: setMock }), 147 + } 148 + const logger = createMockLogger() 149 + const indexer = new ReplyIndexer(db as never, logger as never) 150 + 151 + await indexer.handleUpdate({ 152 + ...baseParams, 153 + record: { 154 + content: '<p>Safe</p><iframe src="evil.com"></iframe>', 155 + root: { uri: 
'at://did:plc:test/forum.barazo.topic.post/topic1', cid: 'bafytopic' }, 156 + parent: { uri: 'at://did:plc:test/forum.barazo.topic.post/topic1', cid: 'bafytopic' }, 157 + community: 'did:plc:community', 158 + createdAt: '2026-01-01T00:00:00.000Z', 159 + }, 160 + }) 161 + 162 + const setValues = setMock.mock.calls[0][0] as Record<string, unknown> 163 + expect(setValues.content).toContain('<p>Safe</p>') 164 + expect(setValues.content).not.toContain('<iframe') 165 + }) 166 + 167 + it('strips bidi override characters from content', async () => { 168 + const insertValuesMock = vi.fn().mockReturnValue({ 169 + onConflictDoNothing: vi.fn().mockResolvedValue(undefined), 170 + }) 171 + const mockTx = { 172 + insert: vi.fn().mockReturnValue({ values: insertValuesMock }), 173 + update: vi.fn().mockReturnValue({ 174 + set: vi.fn().mockReturnValue({ 175 + where: vi.fn().mockResolvedValue(undefined), 176 + }), 177 + }), 178 + } 179 + const db = { 180 + ...createMockDb(), 181 + transaction: vi 182 + .fn() 183 + .mockImplementation(async (fn: (tx: typeof mockTx) => Promise<void>) => fn(mockTx)), 184 + } 185 + const logger = createMockLogger() 186 + const indexer = new ReplyIndexer(db as never, logger as never) 187 + 188 + await indexer.handleCreate({ 189 + ...baseParams, 190 + record: { 191 + content: '<p>\u202AHello\u202E World\u200F</p>', 192 + root: { uri: 'at://did:plc:test/forum.barazo.topic.post/topic1', cid: 'bafytopic' }, 193 + parent: { uri: 'at://did:plc:test/forum.barazo.topic.post/topic1', cid: 'bafytopic' }, 194 + community: 'did:plc:community', 195 + createdAt: '2026-01-01T00:00:00.000Z', 196 + }, 197 + }) 198 + 199 + const values = insertValuesMock.mock.calls[0][0] as Record<string, unknown> 200 + expect(values.content).not.toMatch(/[\u202A-\u202E\u2066-\u2069\u200E\u200F]/) 201 + }) 202 + }) 203 + 102 204 describe('handleDelete', () => { 103 205 it('deletes a reply and decrements count in a transaction', async () => { 104 206 const db = createMockDb()
+90
tests/unit/firehose/indexers/topic.test.ts
··· 102 102 }) 103 103 }) 104 104 105 + describe('sanitization', () => { 106 + it('sanitizes title (strips HTML) and content (strips scripts) on create', async () => { 107 + const valuesMock = vi.fn().mockReturnValue({ 108 + onConflictDoUpdate: vi.fn().mockResolvedValue(undefined), 109 + }) 110 + const db = { 111 + ...createMockDb(), 112 + insert: vi.fn().mockReturnValue({ values: valuesMock }), 113 + } 114 + const logger = createMockLogger() 115 + const indexer = new TopicIndexer(db as never, logger as never) 116 + 117 + await indexer.handleCreate({ 118 + ...baseParams, 119 + record: { 120 + title: '<b>Bold</b> Title<script>alert("xss")</script>', 121 + content: '<p>Good</p><script>alert("xss")</script>', 122 + community: 'did:plc:community', 123 + category: 'general', 124 + createdAt: '2026-01-01T00:00:00.000Z', 125 + }, 126 + }) 127 + 128 + const values = valuesMock.mock.calls[0][0] as Record<string, unknown> 129 + // Title should have ALL HTML stripped (plain text) 130 + expect(values.title).not.toContain('<b>') 131 + expect(values.title).not.toContain('<script>') 132 + expect(values.title).toContain('Bold') 133 + // Content should keep safe tags but strip scripts 134 + expect(values.content).toContain('<p>Good</p>') 135 + expect(values.content).not.toContain('<script>') 136 + }) 137 + 138 + it('sanitizes title and content on update', async () => { 139 + const setMock = vi.fn().mockReturnValue({ 140 + where: vi.fn().mockResolvedValue(undefined), 141 + }) 142 + const db = { 143 + ...createMockDb(), 144 + update: vi.fn().mockReturnValue({ set: setMock }), 145 + } 146 + const logger = createMockLogger() 147 + const indexer = new TopicIndexer(db as never, logger as never) 148 + 149 + await indexer.handleUpdate({ 150 + ...baseParams, 151 + record: { 152 + title: 'Clean <img src=x onerror=alert(1)>', 153 + content: '<p>Safe</p><iframe src="evil.com"></iframe>', 154 + community: 'did:plc:community', 155 + category: 'general', 156 + createdAt: 
'2026-01-01T00:00:00.000Z', 157 + }, 158 + }) 159 + 160 + const setValues = setMock.mock.calls[0][0] as Record<string, unknown> 161 + expect(setValues.title).not.toContain('<img') 162 + expect(setValues.title).not.toContain('onerror') 163 + expect(setValues.content).toContain('<p>Safe</p>') 164 + expect(setValues.content).not.toContain('<iframe') 165 + }) 166 + 167 + it('strips bidi override characters from title and content', async () => { 168 + const valuesMock = vi.fn().mockReturnValue({ 169 + onConflictDoUpdate: vi.fn().mockResolvedValue(undefined), 170 + }) 171 + const db = { 172 + ...createMockDb(), 173 + insert: vi.fn().mockReturnValue({ values: valuesMock }), 174 + } 175 + const logger = createMockLogger() 176 + const indexer = new TopicIndexer(db as never, logger as never) 177 + 178 + await indexer.handleCreate({ 179 + ...baseParams, 180 + record: { 181 + title: '\u202AHello\u202E World', 182 + content: '<p>\u2066Content\u2069</p>', 183 + community: 'did:plc:community', 184 + category: 'general', 185 + createdAt: '2026-01-01T00:00:00.000Z', 186 + }, 187 + }) 188 + 189 + const values = valuesMock.mock.calls[0][0] as Record<string, unknown> 190 + expect(values.title).not.toMatch(/[\u202A-\u202E\u2066-\u2069]/) 191 + expect(values.content).not.toMatch(/[\u202A-\u202E\u2066-\u2069]/) 192 + }) 193 + }) 194 + 105 195 describe('handleDelete', () => { 106 196 it('soft-deletes a topic by URI', async () => { 107 197 const db = createMockDb()
+212
tests/unit/lib/sanitize.test.ts
··· 1 + import { describe, it, expect } from 'vitest' 2 + import { sanitizeHtml, sanitizeText } from '../../../src/lib/sanitize.js' 3 + 4 + describe('sanitize', () => { 5 + describe('sanitizeHtml', () => { 6 + it('returns empty string for empty input', () => { 7 + expect(sanitizeHtml('')).toBe('') 8 + }) 9 + 10 + it('preserves valid markdown-rendered HTML tags', () => { 11 + const input = 12 + '<p>Hello <strong>bold</strong> and <em>italic</em></p>' + 13 + '<blockquote>A quote</blockquote>' + 14 + '<ul><li>Item</li></ul>' + 15 + '<ol><li>Numbered</li></ol>' + 16 + '<pre><code>code block</code></pre>' + 17 + '<h1>Heading</h1><h2>H2</h2><h3>H3</h3><h4>H4</h4><h5>H5</h5><h6>H6</h6>' + 18 + '<hr>' + 19 + '<table><thead><tr><th>Col</th></tr></thead><tbody><tr><td>Cell</td></tr></tbody></table>' + 20 + '<del>strikethrough</del><sup>sup</sup><sub>sub</sub><br>' 21 + const result = sanitizeHtml(input) 22 + // All these tags should survive 23 + expect(result).toContain('<strong>') 24 + expect(result).toContain('<em>') 25 + expect(result).toContain('<blockquote>') 26 + expect(result).toContain('<ul>') 27 + expect(result).toContain('<ol>') 28 + expect(result).toContain('<li>') 29 + expect(result).toContain('<pre>') 30 + expect(result).toContain('<code>') 31 + expect(result).toContain('<h1>') 32 + expect(result).toContain('<table>') 33 + expect(result).toContain('<del>') 34 + expect(result).toContain('<sup>') 35 + expect(result).toContain('<sub>') 36 + expect(result).toContain('<br>') 37 + expect(result).toContain('<hr>') 38 + }) 39 + 40 + it('preserves allowed attributes on links', () => { 41 + const input = '<a href="https://example.com" rel="noopener noreferrer">Link</a>' 42 + const result = sanitizeHtml(input) 43 + expect(result).toContain('href="https://example.com"') 44 + expect(result).toContain('rel="noopener noreferrer"') 45 + }) 46 + 47 + it('preserves img tags with src and alt', () => { 48 + const input = '<img src="https://example.com/img.png" alt="Photo">' 49 + 
const result = sanitizeHtml(input) 50 + expect(result).toContain('src="https://example.com/img.png"') 51 + expect(result).toContain('alt="Photo"') 52 + }) 53 + 54 + it('strips script tags', () => { 55 + const input = '<p>Hello</p><script>alert("xss")</script>' 56 + const result = sanitizeHtml(input) 57 + expect(result).not.toContain('<script>') 58 + expect(result).not.toContain('alert') 59 + expect(result).toContain('<p>Hello</p>') 60 + }) 61 + 62 + it('strips onerror attributes from img tags', () => { 63 + const input = '<img src="x" onerror="alert(1)">' 64 + const result = sanitizeHtml(input) 65 + expect(result).not.toContain('onerror') 66 + expect(result).not.toContain('alert') 67 + }) 68 + 69 + it('strips javascript: protocol from href', () => { 70 + 71 + const input = '<a href="javascript:alert(1)">click</a>' 72 + const result = sanitizeHtml(input) 73 + expect(result).not.toContain('javascript:') 74 + }) 75 + 76 + it('strips iframe tags', () => { 77 + const input = '<iframe src="https://evil.com"></iframe><p>Safe</p>' 78 + const result = sanitizeHtml(input) 79 + expect(result).not.toContain('<iframe') 80 + expect(result).toContain('<p>Safe</p>') 81 + }) 82 + 83 + it('strips style tags', () => { 84 + const input = '<style>body { display: none }</style><p>Visible</p>' 85 + const result = sanitizeHtml(input) 86 + expect(result).not.toContain('<style') 87 + expect(result).toContain('<p>Visible</p>') 88 + }) 89 + 90 + it('strips data attributes', () => { 91 + const input = '<p data-tracking="abc123">Text</p>' 92 + const result = sanitizeHtml(input) 93 + expect(result).not.toContain('data-tracking') 94 + expect(result).toContain('Text') 95 + }) 96 + 97 + it('strips on* event handler attributes', () => { 98 + const input = '<p onclick="alert(1)" onmouseover="alert(2)">Text</p>' 99 + const result = sanitizeHtml(input) 100 + expect(result).not.toContain('onclick') 101 + expect(result).not.toContain('onmouseover') 102 + }) 103 + 104 + it('strips form and input tags', () 
=> { 105 + const input = '<form action="/steal"><input type="text"><button>Submit</button></form>' 106 + const result = sanitizeHtml(input) 107 + expect(result).not.toContain('<form') 108 + expect(result).not.toContain('<input') 109 + }) 110 + 111 + it('applies NFC normalization', () => { 112 + // U+0065 (e) + U+0301 (combining acute accent) = NFD form of e-acute 113 + // NFC normalizes to U+00E9 (e-acute precomposed) 114 + const nfd = 'caf\u0065\u0301' // "café" in NFD 115 + const nfc = 'caf\u00E9' // "café" in NFC 116 + const result = sanitizeHtml(`<p>${nfd}</p>`) 117 + expect(result).toContain(nfc) 118 + }) 119 + 120 + it('strips bidirectional override characters', () => { 121 + // U+202A (LRE), U+202B (RLE), U+202C (PDF), U+202D (LRO), U+202E (RLO) 122 + // U+2066 (LRI), U+2067 (RLI), U+2068 (FSI), U+2069 (PDI) 123 + // U+200E (LRM), U+200F (RLM) 124 + const bidiChars = '\u202A\u202B\u202C\u202D\u202E\u2066\u2067\u2068\u2069\u200E\u200F' 125 + const input = `<p>Hello${bidiChars}World</p>` 126 + const result = sanitizeHtml(input) 127 + expect(result).not.toMatch(/[\u202A-\u202E\u2066-\u2069\u200E\u200F]/) 128 + expect(result).toContain('HelloWorld') 129 + }) 130 + 131 + it('handles combined bidi + script injection', () => { 132 + const input = '<p>\u202EHello</p><script>alert("xss")</script>' 133 + const result = sanitizeHtml(input) 134 + expect(result).not.toContain('<script>') 135 + expect(result).not.toContain('\u202E') 136 + }) 137 + 138 + it('handles very large input without throwing', () => { 139 + const large = '<p>' + 'A'.repeat(100_000) + '</p>' 140 + const result = sanitizeHtml(large) 141 + expect(result).toContain('<p>') 142 + expect(result.length).toBeGreaterThan(0) 143 + }) 144 + 145 + it('preserves plain text without modification (after normalization)', () => { 146 + const input = 'Just plain text with no HTML' 147 + const result = sanitizeHtml(input) 148 + expect(result).toBe('Just plain text with no HTML') 149 + }) 150 + }) 151 + 152 + 
describe('sanitizeText', () => { 153 + it('returns empty string for empty input', () => { 154 + expect(sanitizeText('')).toBe('') 155 + }) 156 + 157 + it('strips all HTML tags', () => { 158 + const input = '<b>Bold</b> and <script>evil()</script>' 159 + const result = sanitizeText(input) 160 + expect(result).not.toContain('<') 161 + expect(result).not.toContain('>') 162 + expect(result).toContain('Bold') 163 + expect(result).not.toContain('evil') 164 + }) 165 + 166 + it('preserves plain text', () => { 167 + const input = 'How to configure PostgreSQL?' 168 + expect(sanitizeText(input)).toBe('How to configure PostgreSQL?') 169 + }) 170 + 171 + it('applies NFC normalization', () => { 172 + const nfd = 'caf\u0065\u0301' 173 + const nfc = 'caf\u00E9' 174 + expect(sanitizeText(nfd)).toBe(nfc) 175 + }) 176 + 177 + it('strips bidirectional override characters', () => { 178 + const input = '\u202AHello\u202E World\u200F' 179 + const result = sanitizeText(input) 180 + expect(result).toBe('Hello World') 181 + }) 182 + 183 + it('strips HTML from titles with injection attempts', () => { 184 + const input = 'Topic <img src=x onerror=alert(1)> Title' 185 + const result = sanitizeText(input) 186 + expect(result).not.toContain('<img') 187 + expect(result).not.toContain('onerror') 188 + expect(result).toContain('Topic') 189 + expect(result).toContain('Title') 190 + }) 191 + 192 + it('handles very large input without throwing', () => { 193 + const large = 'A'.repeat(100_000) 194 + const result = sanitizeText(large) 195 + expect(result.length).toBe(100_000) 196 + }) 197 + 198 + it('strips nested HTML tags', () => { 199 + const input = '<div><p><b>Nested</b></p></div>' 200 + const result = sanitizeText(input) 201 + expect(result).not.toContain('<') 202 + expect(result).toContain('Nested') 203 + }) 204 + 205 + it('handles homoglyph-style text (NFC normalization of composed chars)', () => { 206 + // Latin Small Letter A with Ring Above: U+0061 + U+030A -> U+00E5 207 + const decomposed = 
'\u0061\u030A' 208 + const composed = '\u00E5' 209 + expect(sanitizeText(decomposed)).toBe(composed) 210 + }) 211 + }) 212 + })