feat: handle code blocks · dunkirk.sh/herald@e88a96b

dunkirk.sh / herald

fork atom

rss email digests over ssh because you're a cool kid herald.dunkirk.sh

go rss rss-reader ssh charm

fork atom

feat: handle code blocks

dunkirk.sh 1 month ago e88a96bc bf4828ad

verified

+116 -10

2 changed files

expand all

render.go

render_test.go

+69 -10

email/render.go

··· 3 3 import ( 4 4 "bytes" 5 5 "embed" 6 + "fmt" 6 7 htmltemplate "html/template" 7 8 "regexp" 8 9 "strings" ··· 54 55 // emailUnsafeTags are HTML5 semantic tags not supported by most email clients (Gmail, Outlook, etc.) 55 56 var emailUnsafeTags = regexp.MustCompile(`</?(?:article|section|nav|header|footer|aside|main|figure|figcaption|details|summary|mark|time|dialog)(?:\s[^>]*)?>`) 56 57 58 + // spanTags matches span tags (used to strip syntax highlighting noise from code blocks) 59 + var spanTags = regexp.MustCompile(`</?span(?:\s[^>]*)?>`) 60 + 61 + // preTagOpen matches opening pre tags to add styling 62 + var preTagOpen = regexp.MustCompile(`<pre(?:\s[^>]*)?>`) 63 + 64 + // codeBlockStyle is inline CSS for code blocks in emails 65 + const codeBlockStyle = `<pre style="background-color:#f5f5f5;padding:12px;border-radius:4px;overflow-x:auto;font-family:monospace;font-size:13px;line-height:1.4">` 66 + 57 67 // sanitizeHTML sanitizes HTML content, allowing safe tags while stripping styles and unsafe elements 58 68 func sanitizeHTML(html string) string { 59 69 sanitized := policy.Sanitize(html) 60 70 // Strip HTML5 semantic tags that email clients don't support 61 - return emailUnsafeTags.ReplaceAllString(sanitized, "") 71 + sanitized = emailUnsafeTags.ReplaceAllString(sanitized, "") 72 + // Strip span tags (removes syntax highlighting noise from code blocks) 73 + sanitized = spanTags.ReplaceAllString(sanitized, "") 74 + // Add styling to pre tags for better code block appearance 75 + sanitized = preTagOpen.ReplaceAllString(sanitized, codeBlockStyle) 76 + return sanitized 62 77 } 63 78 64 79 // htmlTagRegex matches HTML tags for stripping 65 80 var htmlTagRegex = regexp.MustCompile(`<[^>]*>`) 66 81 82 + // preBlockRegex matches pre blocks including content 83 + var preBlockRegex = regexp.MustCompile(`(?s)<pre[^>]*>(.*?)</pre>`) 84 + 85 + // whitespaceCollapse collapses multiple whitespace chars 86 + var whitespaceCollapse = regexp.MustCompile(`[ \t]+`) 87 + 88 + 
// multipleNewlines collapses 3+ newlines to 2 89 + var multipleNewlines = regexp.MustCompile(`\n{3,}`) 90 + 91 + // decodeEntities decodes common HTML entities 92 + func decodeEntities(text string) string { 93 + text = strings.ReplaceAll(text, "&amp;", "&") 94 + text = strings.ReplaceAll(text, "&lt;", "<") 95 + text = strings.ReplaceAll(text, "&gt;", ">") 96 + text = strings.ReplaceAll(text, "&quot;", "\"") 97 + text = strings.ReplaceAll(text, "&#39;", "'") 98 + text = strings.ReplaceAll(text, "&nbsp;", " ") 99 + return text 100 + } 101 + 67 102 // stripHTML removes all HTML tags and decodes entities for plain text output 68 103 func stripHTML(html string) string { 69 104 // First sanitize to ensure we're working with clean HTML 70 105 sanitized := policy.Sanitize(html) 106 + 107 + // Extract code blocks and replace with placeholders 108 + var codeBlocks []string 109 + sanitized = preBlockRegex.ReplaceAllStringFunc(sanitized, func(match string) string { 110 + inner := preBlockRegex.FindStringSubmatch(match) 111 + if len(inner) < 2 { 112 + return match 113 + } 114 + code := inner[1] 115 + // Strip any remaining tags (like spans for syntax highlighting) 116 + code = htmlTagRegex.ReplaceAllString(code, "") 117 + code = decodeEntities(code) 118 + // Indent each line with 4 spaces 119 + lines := strings.Split(strings.TrimSpace(code), "\n") 120 + for i, line := range lines { 121 + lines[i] = "    " + line 122 + } 123 + codeBlocks = append(codeBlocks, strings.Join(lines, "\n")) 124 + return fmt.Sprintf("\n\n__CODEBLOCK_%d__\n\n", len(codeBlocks)-1) 125 + }) 126 + 71 127 // Strip all remaining HTML tags 72 128 text := htmlTagRegex.ReplaceAllString(sanitized, "") 73 - // Decode common HTML entities 74 - text = strings.ReplaceAll(text, "&amp;", "&") 75 - text = strings.ReplaceAll(text, "&lt;", "<") 76 - text = strings.ReplaceAll(text, "&gt;", ">") 77 - text = strings.ReplaceAll(text, "&quot;", "\"") 78 - text = strings.ReplaceAll(text, "&#39;", "'") 79 - text = strings.ReplaceAll(text, "&nbsp;", " ") 80 - // Collapse
multiple whitespace/newlines 81 - text = regexp.MustCompile(`\s+`).ReplaceAllString(text, " ") 129 + // Decode entities 130 + text = decodeEntities(text) 131 + // Collapse horizontal whitespace (but preserve newlines for structure) 132 + text = whitespaceCollapse.ReplaceAllString(text, " ") 133 + // Collapse excessive newlines 134 + text = multipleNewlines.ReplaceAllString(text, "\n\n") 135 + 136 + // Restore code blocks 137 + for i, block := range codeBlocks { 138 + text = strings.ReplaceAll(text, fmt.Sprintf("__CODEBLOCK_%d__", i), block) 139 + } 140 + 82 141 return strings.TrimSpace(text) 83 142 } 84 143

+47

email/render_test.go

··· 134 134 t.Error("Text content was not preserved after HTML stripping") 135 135 } 136 136 } 137 + 138 + func TestRenderDigest_CodeBlockFormatting(t *testing.T) { 139 + data := &DigestData{ 140 + ConfigName: "Test Config", 141 + TotalItems: 1, 142 + FeedGroups: []FeedGroup{ 143 + { 144 + FeedName: "Test Feed", 145 + FeedURL: "https://example.com/feed", 146 + Items: []FeedItem{ 147 + { 148 + Title: "Test Article", 149 + Link: "https://example.com/article", 150 + Content: `Code example:<pre># comment 151 + echo hello</pre>Done.`, 152 + Published: time.Now(), 153 + }, 154 + }, 155 + }, 156 + }, 157 + } 158 + 159 + htmlOutput, textOutput, err := RenderDigest(data, true, 30, false, false) 160 + if err != nil { 161 + t.Fatalf("RenderDigest failed: %v", err) 162 + } 163 + 164 + // HTML: verify code block has styling 165 + if !strings.Contains(htmlOutput, `<pre style="background-color:#f5f5f5`) { 166 + t.Error("HTML code block missing styling") 167 + } 168 + 169 + // HTML: verify syntax highlighting spans are stripped 170 + if strings.Contains(htmlOutput, `class="c1"`) { 171 + t.Error("Syntax highlighting classes should be stripped") 172 + } 173 + 174 + // Text: verify code is indented 175 + if !strings.Contains(textOutput, "    # comment") { 176 + t.Error("Text code block should be indented with 4 spaces") 177 + } 178 + 179 + // Text: verify no HTML tags in code block 180 + if strings.Contains(textOutput, "<span") || strings.Contains(textOutput, "<pre") { 181 + t.Error("Text output should not contain HTML tags") 182 + } 183 + }