Weighs the soul of incoming HTTP requests to stop AI crawlers

feat(config): allow multi-level imports (#402)

* feat(config): allow multi-level imports

Signed-off-by: Xe Iaso <me@xeiaso.net>

* chore(data): fix spelling of Marginalia

Signed-off-by: Xe Iaso <me@xeiaso.net>

---------

Signed-off-by: Xe Iaso <me@xeiaso.net>

authored by

Xe Iaso and committed by
GitHub
6e823737 f8e1000a

+88 -49
+9 -32
data/botPolicies.json
··· 1 1 { 2 2 "bots": [ 3 3 { 4 - "import": "(data)/bots/ai-robots-txt.yaml" 4 + "import": "(data)/bots/_deny-pathological.yaml" 5 5 }, 6 6 { 7 - "import": "(data)/bots/cloudflare-workers.yaml" 7 + "import": "(data)/bots/ai-robots-txt.yaml" 8 8 }, 9 9 { 10 - "import": "(data)/bots/headless-browsers.yaml" 11 - }, 12 - { 13 - "import": "(data)/bots/us-ai-scraper.yaml" 14 - }, 15 - { 16 - "import": "(data)/crawlers/googlebot.yaml" 17 - }, 18 - { 19 - "import": "(data)/crawlers/bingbot.yaml" 20 - }, 21 - { 22 - "import": "(data)/crawlers/duckduckbot.yaml" 23 - }, 24 - { 25 - "import": "(data)/crawlers/qwantbot.yaml" 26 - }, 27 - { 28 - "import": "(data)/crawlers/internet-archive.yaml" 29 - }, 30 - { 31 - "import": "(data)/crawlers/kagibot.yaml" 32 - }, 33 - { 34 - "import": "(data)/crawlers/marginalia.yaml" 35 - }, 36 - { 37 - "import": "(data)/crawlers/mojeekbot.yaml" 10 + "import": "(data)/crawlers/_allow-good.yaml" 38 11 }, 39 12 { 40 13 "import": "(data)/common/keep-internet-working.yaml" ··· 45 18 "action": "CHALLENGE" 46 19 } 47 20 ], 48 - "dnsbl": false 49 - } 21 + "dnsbl": false, 22 + "status_codes": { 23 + "CHALLENGE": 200, 24 + "DENY": 200 25 + } 26 + }
+16 -14
data/botPolicies.yaml
··· 12 12 13 13 bots: 14 14 # Pathological bots to deny 15 - - # This correlates to data/bots/ai-robots-txt.yaml in the source tree 16 - import: (data)/bots/ai-robots-txt.yaml 17 - - import: (data)/bots/cloudflare-workers.yaml 18 - - import: (data)/bots/headless-browsers.yaml 19 - - import: (data)/bots/us-ai-scraper.yaml 15 + - # This correlates to data/bots/_deny-pathological.yaml in the source tree 16 + # https://github.com/TecharoHQ/anubis/blob/main/data/bots/_deny-pathological.yaml 17 + import: (data)/bots/_deny-pathological.yaml 20 18 21 - # Search engines to allow 22 - - import: (data)/crawlers/googlebot.yaml 23 - - import: (data)/crawlers/bingbot.yaml 24 - - import: (data)/crawlers/duckduckbot.yaml 25 - - import: (data)/crawlers/qwantbot.yaml 26 - - import: (data)/crawlers/internet-archive.yaml 27 - - import: (data)/crawlers/kagibot.yaml 28 - - import: (data)/crawlers/marginalia.yaml 29 - - import: (data)/crawlers/mojeekbot.yaml 19 + # Enforce https://github.com/ai-robots-txt/ai.robots.txt 20 + - import: (data)/bots/ai-robots-txt.yaml 21 + 22 + # Search engine crawlers to allow, defaults to: 23 + # - Google (so they don't try to bypass Anubis) 24 + # - Bing 25 + # - DuckDuckGo 26 + # - Qwant 27 + # - The Internet Archive 28 + # - Kagi 29 + # - Marginalia 30 + # - Mojeek 31 + - import: (data)/crawlers/_allow-good.yaml 30 32 31 33 # Allow common "keeping the internet working" routes (well-known, favicon, robots.txt) 32 34 - import: (data)/common/keep-internet-working.yaml
+3
data/bots/_deny-pathological.yaml
··· 1 + - import: (data)/bots/cloudflare-workers.yaml 2 + - import: (data)/bots/headless-browsers.yaml 3 + - import: (data)/bots/us-ai-scraper.yaml
+8
data/crawlers/_allow-good.yaml
··· 1 + - import: (data)/crawlers/googlebot.yaml 2 + - import: (data)/crawlers/bingbot.yaml 3 + - import: (data)/crawlers/duckduckbot.yaml 4 + - import: (data)/crawlers/qwantbot.yaml 5 + - import: (data)/crawlers/internet-archive.yaml 6 + - import: (data)/crawlers/kagibot.yaml 7 + - import: (data)/crawlers/marginalia.yaml 8 + - import: (data)/crawlers/mojeekbot.yaml
+1 -1
data/embed.go
··· 3 3 import "embed" 4 4 5 5 var ( 6 - //go:embed botPolicies.yaml botPolicies.json apps bots common crawlers 6 + //go:embed botPolicies.yaml botPolicies.json all:apps all:bots all:common all:crawlers 7 7 BotPolicies embed.FS 8 8 )
+1
docs/docs/CHANGELOG.md
··· 18 18 - Added customization of authorization cookie expiration time with `--cookie-expiration-time` flag or envvar 19 19 - Updated the `OG_PASSTHROUGH` to be true by default, thereby allowing OpenGraph tags to be passed through by default 20 20 - Added the ability to [customize Anubis' HTTP status codes](./admin/configuration/custom-status-codes.mdx) ([#355](https://github.com/TecharoHQ/anubis/issues/355)) 21 + - Changed the import syntax to allow multi-level imports 21 22 22 23 ## v1.17.0: Asahi sas Brutus 23 24
+39
docs/docs/admin/configuration/import.mdx
··· 79 79 80 80 Paths can either be prefixed with `(data)` to import from [the data folder in the Anubis source tree](https://github.com/TecharoHQ/anubis/tree/main/data) or anywhere on the filesystem. If you don't have access to the Anubis source tree, check /usr/share/docs/anubis/data or the tarball you extracted Anubis from. 81 81 82 + ## Importing from imports 83 + 84 + You can also import from an imported file in case you want to import an entire folder of rules at once. 85 + 86 + <Tabs> 87 + <TabItem value="json" label="JSON"> 88 + 89 + ```json 90 + { 91 + "bots": [ 92 + { 93 + "import": "(data)/bots/_deny-pathological.yaml" 94 + } 95 + ] 96 + } 97 + ``` 98 + 99 + </TabItem> 100 + <TabItem value="yaml" label="YAML" default> 101 + 102 + ```yaml 103 + bots: 104 + - import: (data)/bots/_deny-pathological.yaml 105 + ``` 106 + 107 + </TabItem> 108 + </Tabs> 109 + 110 + This lets you import an entire ruleset at once: 111 + 112 + ```yaml 113 + # (data)/bots/_deny-pathological.yaml 114 + - import: (data)/bots/cloudflare-workers.yaml 115 + - import: (data)/bots/headless-browsers.yaml 116 + - import: (data)/bots/us-ai-scraper.yaml 117 + ``` 118 + 119 + Use this with care: you can easily get yourself into a state where Anubis recursively imports things for eternity if you are not careful. The best way to use this is to make a "root import" named `_everything.yaml` or `_allow-good.yaml` so they sort to the top. Name your meta-imports after the main verb they are enforcing so that you can glance at the configuration file and understand what it's doing. 120 + 82 121 ## Writing snippets 83 122 84 123 Snippets can be written in either JSON or YAML, with a preference for YAML. When writing a snippet, write the bot rules you want directly at the top level of the file in a list.
+11 -2
lib/policy/config/config.go
··· 216 216 } 217 217 defer fin.Close() 218 218 219 + var imported []BotOrImport 219 220 var result []BotConfig 220 221 221 - if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&result); err != nil { 222 + if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&imported); err != nil { 222 223 return fmt.Errorf("can't parse %s: %w", is.Import, err) 223 224 } 224 225 225 226 var errs []error 226 227 227 - for _, b := range result { 228 + for _, b := range imported { 228 229 if err := b.Valid(); err != nil { 229 230 errs = append(errs, err) 231 + } 232 + 233 + if b.ImportStatement != nil { 234 + result = append(result, b.ImportStatement.Bots...) 235 + } 236 + 237 + if b.BotConfig != nil { 238 + result = append(result, *b.BotConfig) 230 239 } 231 240 } 232 241