Weighs the soul of incoming HTTP requests to stop AI crawlers

feat(config): add ability to customize HTTP status codes Anubis returns (#393)

Signed-off-by: Xe Iaso <me@xeiaso.net>

authored by

Xe Iaso and committed by
GitHub
74d330ce 2935bd4a

+242 -9
+8
data/botPolicies.yaml
··· 48 48 action: CHALLENGE 49 49 50 50 dnsbl: false 51 + 52 + # By default, send HTTP 200 back to clients that either get issued a challenge 53 + # or a denial. This seems weird, but this is load-bearing due to the fact that 54 + # the most aggressive scraper bots seem to really really want an HTTP 200 and 55 + # will stop sending requests once they get it. 56 + status_codes: 57 + CHALLENGE: 200 58 + DENY: 200
+2
docs/docs/CHANGELOG.md
··· 11 11 12 12 ## [Unreleased] 13 13 14 + - Added the ability to [customize Anubis' HTTP status codes](./admin/configuration/custom-status-codes.mdx) ([#355](https://github.com/TecharoHQ/anubis/issues/355)) 15 + 14 16 ## v1.17.0: Asahi sas Brutus 15 17 16 18 - Ensure regexes can't end in newlines ([#372](https://github.com/TecharoHQ/anubis/issues/372))
+19
docs/docs/admin/configuration/custom-status-codes.mdx
··· 1 + # Custom status codes for Anubis errors 2 + 3 + Out of the box, Anubis will reply with `HTTP 200` for challenge and denial pages. This is intended to make AI scrapers give up on your website sooner, because when they are faced with a non-200 response, they will hammer the page over and over until they get a 200 response. This behavior may not always be desirable, so Anubis lets you customize which HTTP status codes are returned when it serves challenge and denial pages. 4 + 5 + This is configured in the `status_codes` block of your [bot policy file](../policies.mdx): 6 + 7 + ```yaml 8 + status_codes: 9 + CHALLENGE: 200 10 + DENY: 200 11 + ``` 12 + 13 + To match Cloudflare's behavior, use a configuration like this: 14 + 15 + ```yaml 16 + status_codes: 17 + CHALLENGE: 403 18 + DENY: 403 19 + ```
+2 -2
lib/anubis.go
··· 170 170 hash := rule.Hash() 171 171 172 172 lg.Debug("rule hash", "hash", hash) 173 - s.respondWithStatus(w, r, fmt.Sprintf("Access Denied: error code %s", hash), http.StatusOK) 173 + s.respondWithStatus(w, r, fmt.Sprintf("Access Denied: error code %s", hash), s.policy.StatusCodes.Deny) 174 174 return true 175 175 case config.RuleChallenge: 176 176 lg.Debug("challenge requested") ··· 202 202 203 203 if resp != dnsbl.AllGood { 204 204 lg.Info("DNSBL hit", "status", resp.String()) 205 - s.respondWithStatus(w, r, fmt.Sprintf("DroneBL reported an entry: %s, see https://dronebl.org/lookup?ip=%s", resp.String(), ip), http.StatusOK) 205 + s.respondWithStatus(w, r, fmt.Sprintf("DroneBL reported an entry: %s, see https://dronebl.org/lookup?ip=%s", resp.String(), ip), s.policy.StatusCodes.Deny) 206 206 return true 207 207 } 208 208 }
+45
lib/anubis_test.go
··· 393 393 }) 394 394 } 395 395 } 396 + 397 + func TestCustomStatusCodes(t *testing.T) { 398 + h := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { 399 + t.Log(r.UserAgent()) 400 + w.WriteHeader(http.StatusOK) 401 + fmt.Fprintln(w, "OK") 402 + }) 403 + 404 + statusMap := map[string]int{ 405 + "ALLOW": 200, 406 + "CHALLENGE": 401, 407 + "DENY": 403, 408 + } 409 + 410 + pol := loadPolicies(t, "./testdata/aggressive_403.yaml") 411 + pol.DefaultDifficulty = 4 412 + 413 + srv := spawnAnubis(t, Options{ 414 + Next: h, 415 + Policy: pol, 416 + }) 417 + 418 + ts := httptest.NewServer(internal.RemoteXRealIP(true, "tcp", srv)) 419 + defer ts.Close() 420 + 421 + for userAgent, statusCode := range statusMap { 422 + t.Run(userAgent, func(t *testing.T) { 423 + req, err := http.NewRequestWithContext(t.Context(), http.MethodGet, ts.URL, nil) 424 + if err != nil { 425 + t.Fatal(err) 426 + } 427 + 428 + req.Header.Set("User-Agent", userAgent) 429 + 430 + resp, err := ts.Client().Do(req) 431 + if err != nil { 432 + t.Fatal(err) 433 + } 434 + 435 + if resp.StatusCode != statusCode { 436 + t.Errorf("wanted status code %d but got: %d", statusCode, resp.StatusCode) 437 + } 438 + }) 439 + } 440 + }
+4 -1
lib/http.go
··· 67 67 return 68 68 } 69 69 70 - handler := internal.NoStoreCache(templ.Handler(component)) 70 + handler := internal.NoStoreCache(templ.Handler( 71 + component, 72 + templ.WithStatus(s.opts.Policy.StatusCodes.Challenge), 73 + )) 71 74 handler.ServeHTTP(w, r) 72 75 } 73 76
+41 -5
lib/policy/config/config.go
··· 6 6 "io" 7 7 "io/fs" 8 8 "net" 9 + "net/http" 9 10 "os" 10 11 "regexp" 11 12 "strings" ··· 28 29 ErrInvalidImportStatement = errors.New("config.ImportStatement: invalid source file") 29 30 ErrCantSetBotAndImportValuesAtOnce = errors.New("config.BotOrImport: can't set bot rules and import values at the same time") 30 31 ErrMustSetBotOrImportRules = errors.New("config.BotOrImport: rule definition is invalid, you must set either bot rules or an import statement, not both") 32 + ErrStatusCodeNotValid = errors.New("config.StatusCode: status code not valid, must be between 100 and 599") 31 33 ) 32 34 33 35 type Rule string ··· 262 264 return ErrMustSetBotOrImportRules 263 265 } 264 266 267 + type StatusCodes struct { 268 + Challenge int `json:"CHALLENGE"` 269 + Deny int `json:"DENY"` 270 + } 271 + 272 + func (sc StatusCodes) Valid() error { 273 + var errs []error 274 + 275 + if sc.Challenge == 0 || (sc.Challenge < 100 && sc.Challenge >= 599) { 276 + errs = append(errs, fmt.Errorf("%w: challenge is %d", ErrStatusCodeNotValid, sc.Challenge)) 277 + } 278 + 279 + if sc.Deny == 0 || (sc.Deny < 100 && sc.Deny >= 599) { 280 + errs = append(errs, fmt.Errorf("%w: deny is %d", ErrStatusCodeNotValid, sc.Deny)) 281 + } 282 + 283 + if len(errs) != 0 { 284 + return fmt.Errorf("status codes not valid:\n%w", errors.Join(errs...)) 285 + } 286 + 287 + return nil 288 + } 289 + 265 290 type fileConfig struct { 266 - Bots []BotOrImport `json:"bots"` 267 - DNSBL bool `json:"dnsbl"` 291 + Bots []BotOrImport `json:"bots"` 292 + DNSBL bool `json:"dnsbl"` 293 + StatusCodes StatusCodes `json:"status_codes"` 268 294 } 269 295 270 296 func (c fileConfig) Valid() error { ··· 280 306 } 281 307 } 282 308 309 + if err := c.StatusCodes.Valid(); err != nil { 310 + errs = append(errs, err) 311 + } 312 + 283 313 if len(errs) != 0 { 284 314 return fmt.Errorf("config is not valid:\n%w", errors.Join(errs...)) 285 315 } ··· 289 319 290 320 func Load(fin io.Reader, fname string) (*Config, error) { 291 321 
var c fileConfig 322 + c.StatusCodes = StatusCodes{ 323 + Challenge: http.StatusOK, 324 + Deny: http.StatusOK, 325 + } 292 326 if err := yaml.NewYAMLToJSONDecoder(fin).Decode(&c); err != nil { 293 327 return nil, fmt.Errorf("can't parse policy config YAML %s: %w", fname, err) 294 328 } ··· 298 332 } 299 333 300 334 result := &Config{ 301 - DNSBL: c.DNSBL, 335 + DNSBL: c.DNSBL, 336 + StatusCodes: c.StatusCodes, 302 337 } 303 338 304 339 var validationErrs []error ··· 331 366 } 332 367 333 368 type Config struct { 334 - Bots []BotConfig 335 - DNSBL bool 369 + Bots []BotConfig 370 + DNSBL bool 371 + StatusCodes StatusCodes 336 372 } 337 373 338 374 func (c Config) Valid() error {
+13
lib/policy/config/testdata/bad/status-codes-0.json
··· 1 + { 2 + "bots": [ 3 + { 4 + "name": "everything", 5 + "user_agent_regex": ".*", 6 + "action": "DENY" 7 + } 8 + ], 9 + "status_codes": { 10 + "CHALLENGE": 0, 11 + "DENY": 0 12 + } 13 + }
+8
lib/policy/config/testdata/bad/status-codes-0.yaml
··· 1 + bots: 2 + - name: everything 3 + user_agent_regex: .* 4 + action: DENY 5 + 6 + status_codes: 7 + CHALLENGE: 0 8 + DENY: 0
+13
lib/policy/config/testdata/good/status-codes-paranoid.json
··· 1 + { 2 + "bots": [ 3 + { 4 + "name": "everything", 5 + "user_agent_regex": ".*", 6 + "action": "DENY" 7 + } 8 + ], 9 + "status_codes": { 10 + "CHALLENGE": 200, 11 + "DENY": 200 12 + } 13 + }
+8
lib/policy/config/testdata/good/status-codes-paranoid.yaml
··· 1 + bots: 2 + - name: everything 3 + user_agent_regex: .* 4 + action: DENY 5 + 6 + status_codes: 7 + CHALLENGE: 200 8 + DENY: 200
+13
lib/policy/config/testdata/good/status-codes-rfc.json
··· 1 + { 2 + "bots": [ 3 + { 4 + "name": "everything", 5 + "user_agent_regex": ".*", 6 + "action": "DENY" 7 + } 8 + ], 9 + "status_codes": { 10 + "CHALLENGE": 403, 11 + "DENY": 403 12 + } 13 + }
+8
lib/policy/config/testdata/good/status-codes-rfc.yaml
··· 1 + bots: 2 + - name: everything 3 + user_agent_regex: .* 4 + action: DENY 5 + 6 + status_codes: 7 + CHALLENGE: 403 8 + DENY: 403
+3 -1
lib/policy/policy.go
··· 24 24 Bots []Bot 25 25 DNSBL bool 26 26 DefaultDifficulty int 27 + StatusCodes config.StatusCodes 27 28 } 28 29 29 30 func NewParsedConfig(orig *config.Config) *ParsedConfig { 30 31 return &ParsedConfig{ 31 - orig: orig, 32 + orig: orig, 33 + StatusCodes: orig.StatusCodes, 32 34 } 33 35 } 34 36
+12
lib/testdata/aggressive_403.yaml
··· 1 + bots: 2 + - name: deny 3 + user_agent_regex: DENY 4 + action: DENY 5 + 6 + - name: challenge 7 + user_agent_regex: CHALLENGE 8 + action: CHALLENGE 9 + 10 + status_codes: 11 + CHALLENGE: 401 12 + DENY: 403
+12
test/anubis_configs/aggressive_403.yaml
··· 1 + bots: 2 + - name: deny 3 + user_agent_regex: DENY 4 + action: DENY 5 + 6 + - name: challenge 7 + user_agent_regex: CHALLENGE 8 + action: CHALLENGE 9 + 10 + status_codes: 11 + CHALLENGE: 401 12 + DENY: 403
+1
test/unix-socket-xff/start.sh
··· 37 37 go tool anubis \ 38 38 --bind=./anubis.sock \ 39 39 --bind-network=unix \ 40 + --policy-fname=../anubis_configs/aggressive_403.yaml \ 40 41 --target=unix://$(pwd)/unixhttpd.sock & 41 42 42 43 # A simple TLS terminator that forwards to Anubis, which will forward to
+30
test/unix-socket-xff/test.mjs
··· 1 + async function testWithUserAgent(userAgent) { 2 + const statusCode = 3 + await fetch("https://relayd.local.cetacean.club:3004/reqmeta", { 4 + headers: { 5 + "User-Agent": userAgent, 6 + } 7 + }) 8 + .then(resp => resp.status); 9 + return statusCode; 10 + } 11 + 12 + const codes = { 13 + allow: await testWithUserAgent("ALLOW"), 14 + challenge: await testWithUserAgent("CHALLENGE"), 15 + deny: await testWithUserAgent("DENY") 16 + } 17 + 18 + const expected = { 19 + allow: 200, 20 + challenge: 401, 21 + deny: 403, 22 + }; 23 + 24 + console.log("ALLOW: ", codes.allow); 25 + console.log("CHALLENGE:", codes.challenge); 26 + console.log("DENY: ", codes.deny); 27 + 28 + if (JSON.stringify(codes) !== JSON.stringify(expected)) { 29 + throw new Error(`wanted ${JSON.stringify(expected)}, got: ${JSON.stringify(codes)}`); 30 + }