Auto-indexing service and GraphQL API for AT Protocol records — quickslice.slices.network/
atproto gleam graphql

add more backfill knobs to reduce memory consumption in container environments

+265 -99
+34
docs/deployment.md
··· 251 251 docker compose logs -f quickslice 252 252 ``` 253 253 254 + ## Backfill Configuration 255 + 256 + Control memory usage during backfill operations with these environment variables: 257 + 258 + | Variable | Default | Description | 259 + |----------|---------|-------------| 260 + | `BACKFILL_MAX_PDS_WORKERS` | 10 | Max concurrent PDS endpoints being processed | 261 + | `BACKFILL_PDS_CONCURRENCY` | 4 | Max concurrent repo fetches per PDS | 262 + | `BACKFILL_MAX_HTTP_CONCURRENT` | 50 | Global HTTP request limit | 263 + | `BACKFILL_REPO_TIMEOUT` | 60 | Timeout per repo fetch (seconds) | 264 + 265 + ### Recommended Settings by VPS Size 266 + 267 + **1GB RAM (e.g., Railway starter):** 268 + ``` 269 + BACKFILL_MAX_PDS_WORKERS=8 270 + BACKFILL_PDS_CONCURRENCY=2 271 + BACKFILL_MAX_HTTP_CONCURRENT=30 272 + ``` 273 + 274 + **2GB RAM:** 275 + ``` 276 + BACKFILL_MAX_PDS_WORKERS=15 277 + BACKFILL_PDS_CONCURRENCY=4 278 + BACKFILL_MAX_HTTP_CONCURRENT=50 279 + ``` 280 + 281 + **4GB+ RAM:** 282 + ``` 283 + BACKFILL_MAX_PDS_WORKERS=25 284 + BACKFILL_PDS_CONCURRENCY=6 285 + BACKFILL_MAX_HTTP_CONCURRENT=100 286 + ``` 287 + 254 288 ## Resource Requirements 255 289 256 290 **Minimum**:
+219 -72
server/src/backfill.gleam
··· 50 50 plc_directory_url: String, 51 51 index_actors: Bool, 52 52 max_concurrent_per_pds: Int, 53 + max_pds_workers: Int, 54 + max_http_concurrent: Int, 55 + repo_fetch_timeout_ms: Int, 53 56 did_cache: Option(Subject(did_cache.Message)), 54 57 ) 55 58 } ··· 62 65 Error(_) -> "https://plc.directory" 63 66 } 64 67 65 - // Get max concurrent per PDS from environment or use default of 6 68 + // Get max concurrent per PDS from environment or use default of 4 66 69 let max_pds_concurrent = case envoy.get("BACKFILL_PDS_CONCURRENCY") { 67 70 Ok(val) -> { 68 71 case int.parse(val) { 69 72 Ok(n) -> n 70 - Error(_) -> 6 73 + Error(_) -> 4 74 + } 75 + } 76 + Error(_) -> 4 77 + } 78 + 79 + // Get max PDS workers from environment or use default of 10 80 + let max_pds_workers = case envoy.get("BACKFILL_MAX_PDS_WORKERS") { 81 + Ok(val) -> { 82 + case int.parse(val) { 83 + Ok(n) -> n 84 + Error(_) -> 10 85 + } 86 + } 87 + Error(_) -> 10 88 + } 89 + 90 + // Get max HTTP concurrent from environment or use default of 50 91 + let max_http = case envoy.get("BACKFILL_MAX_HTTP_CONCURRENT") { 92 + Ok(val) -> { 93 + case int.parse(val) { 94 + Ok(n) -> n 95 + Error(_) -> 50 71 96 } 72 97 } 73 - Error(_) -> 6 98 + Error(_) -> 50 99 + } 100 + 101 + // Get repo fetch timeout from environment or use default of 60s 102 + let repo_timeout = case envoy.get("BACKFILL_REPO_TIMEOUT") { 103 + Ok(val) -> { 104 + case int.parse(val) { 105 + Ok(n) -> n * 1000 106 + Error(_) -> 60_000 107 + } 108 + } 109 + Error(_) -> 60_000 74 110 } 75 111 76 - // Configure hackney pool for better connection reuse 77 - // We'll call directly into Erlang to set up the pool 78 - configure_hackney_pool() 112 + // Configure hackney pool with the configured HTTP limit 113 + configure_hackney_pool(max_http) 79 114 80 115 BackfillConfig( 81 116 plc_directory_url: plc_url, 82 117 index_actors: True, 83 118 max_concurrent_per_pds: max_pds_concurrent, 119 + max_pds_workers: max_pds_workers, 120 + max_http_concurrent: max_http, 
121 + repo_fetch_timeout_ms: repo_timeout, 84 122 did_cache: None, 85 123 ) 86 124 } ··· 91 129 BackfillConfig(..config, did_cache: Some(cache)) 92 130 } 93 131 94 - /// Configure hackney connection pool with higher limits 95 - /// Called via Erlang FFI to avoid atom conversion issues 132 + /// Configure hackney connection pool with specified limits 96 133 @external(erlang, "backfill_ffi", "configure_pool") 97 - fn configure_hackney_pool() -> Nil 134 + fn configure_hackney_pool(max_concurrent: Int) -> Nil 98 135 99 136 /// Acquire a permit from the global HTTP semaphore 100 137 /// Blocks if at the concurrent request limit (150) ··· 697 734 max_concurrent: Int, 698 735 conn: sqlight.Connection, 699 736 validation_ctx: Option(honk.ValidationContext), 737 + timeout_ms: Int, 700 738 reply_to: Subject(Int), 701 739 ) -> Nil { 702 740 logging.log( ··· 704 742 "[backfill] PDS worker starting for " 705 743 <> pds_url 706 744 <> " with " 707 - <> string.inspect(list.length(repos)) 745 + <> int.to_string(list.length(repos)) 708 746 <> " repos", 709 747 ) 710 748 let subject = process.new_subject() ··· 737 775 collections, 738 776 conn, 739 777 validation_ctx, 778 + timeout_ms, 740 779 0, 741 780 ) 742 781 ··· 745 784 "[backfill] PDS worker finished for " 746 785 <> pds_url 747 786 <> " with " 748 - <> string.inspect(total_count) 787 + <> int.to_string(total_count) 749 788 <> " total records", 750 789 ) 751 790 process.send(reply_to, total_count) ··· 760 799 collections: List(String), 761 800 conn: sqlight.Connection, 762 801 validation_ctx: Option(honk.ValidationContext), 802 + timeout_ms: Int, 763 803 total: Int, 764 804 ) -> Int { 765 805 case in_flight { 766 806 0 -> total 767 807 _ -> { 768 - // 5 minute timeout per CAR worker (validation adds processing time for large repos) 769 - case process.receive(subject, 300_000) { 808 + case process.receive(subject, timeout_ms) { 770 809 Ok(count) -> { 771 810 let new_total = total + count 772 811 case remaining { ··· 789 828 
collections, 790 829 conn, 791 830 validation_ctx, 831 + timeout_ms, 792 832 new_total, 793 833 ) 794 834 } ··· 801 841 collections, 802 842 conn, 803 843 validation_ctx, 844 + timeout_ms, 804 845 new_total, 805 846 ) 806 847 } ··· 811 852 "[backfill] Timeout waiting for CAR worker on " 812 853 <> pds_url 813 854 <> " (in_flight: " 814 - <> string.inspect(in_flight) 855 + <> int.to_string(in_flight) 815 856 <> ", remaining: " 816 - <> string.inspect(list.length(remaining)) 857 + <> int.to_string(list.length(remaining)) 817 858 <> ")", 818 859 ) 819 860 sliding_window_car( ··· 824 865 collections, 825 866 conn, 826 867 validation_ctx, 868 + timeout_ms, 827 869 total, 828 870 ) 829 871 } ··· 832 874 } 833 875 } 834 876 877 + /// Sliding window for PDS worker processing 878 + /// Limits how many PDS endpoints are processed concurrently 879 + fn sliding_window_pds( 880 + remaining: List(#(String, List(#(String, String)))), 881 + subject: Subject(Int), 882 + in_flight: Int, 883 + collections: List(String), 884 + max_concurrent_per_pds: Int, 885 + conn: sqlight.Connection, 886 + validation_ctx: Option(honk.ValidationContext), 887 + timeout_ms: Int, 888 + total: Int, 889 + pds_count: Int, 890 + completed: Int, 891 + ) -> Int { 892 + case in_flight { 893 + 0 -> total 894 + _ -> { 895 + // 5 minute timeout per PDS worker 896 + case process.receive(subject, 300_000) { 897 + Ok(count) -> { 898 + let new_total = total + count 899 + let new_completed = completed + 1 900 + logging.log( 901 + logging.Info, 902 + "[backfill] PDS worker " 903 + <> int.to_string(new_completed) 904 + <> "/" 905 + <> int.to_string(pds_count) 906 + <> " done (" 907 + <> int.to_string(count) 908 + <> " records)", 909 + ) 910 + case remaining { 911 + [#(pds_url, repo_pairs), ..rest] -> { 912 + let pds_repos = 913 + repo_pairs 914 + |> list.map(fn(pair) { 915 + let #(_pds, repo) = pair 916 + repo 917 + }) 918 + process.spawn_unlinked(fn() { 919 + pds_worker_car( 920 + pds_url, 921 + pds_repos, 922 + 
collections, 923 + max_concurrent_per_pds, 924 + conn, 925 + validation_ctx, 926 + timeout_ms, 927 + subject, 928 + ) 929 + }) 930 + sliding_window_pds( 931 + rest, 932 + subject, 933 + in_flight, 934 + collections, 935 + max_concurrent_per_pds, 936 + conn, 937 + validation_ctx, 938 + timeout_ms, 939 + new_total, 940 + pds_count, 941 + new_completed, 942 + ) 943 + } 944 + [] -> 945 + sliding_window_pds( 946 + [], 947 + subject, 948 + in_flight - 1, 949 + collections, 950 + max_concurrent_per_pds, 951 + conn, 952 + validation_ctx, 953 + timeout_ms, 954 + new_total, 955 + pds_count, 956 + new_completed, 957 + ) 958 + } 959 + } 960 + Error(_) -> { 961 + logging.log( 962 + logging.Warning, 963 + "[backfill] PDS worker timed out (in_flight: " 964 + <> int.to_string(in_flight) 965 + <> ", remaining: " 966 + <> int.to_string(list.length(remaining)) 967 + <> ")", 968 + ) 969 + sliding_window_pds( 970 + remaining, 971 + subject, 972 + in_flight - 1, 973 + collections, 974 + max_concurrent_per_pds, 975 + conn, 976 + validation_ctx, 977 + timeout_ms, 978 + total, 979 + pds_count, 980 + completed, 981 + ) 982 + } 983 + } 984 + } 985 + } 986 + } 987 + 835 988 /// CAR-based streaming: fetch repos as CAR files and filter locally 836 989 /// One request per repo instead of one per (repo, collection) 837 990 pub fn get_records_for_repos_car( ··· 864 1017 pds 865 1018 }) 866 1019 867 - // Spawn one worker per PDS 868 - let subject = process.new_subject() 869 1020 let pds_entries = dict.to_list(repos_by_pds) 870 1021 let pds_count = list.length(pds_entries) 871 1022 872 - let _pds_workers = 873 - pds_entries 874 - |> list.map(fn(pds_entry) { 875 - let #(pds_url, repo_pairs) = pds_entry 876 - let pds_repos = 877 - repo_pairs 878 - |> list.map(fn(pair) { 879 - let #(_pds, repo) = pair 880 - repo 881 - }) 1023 + logging.log( 1024 + logging.Info, 1025 + "[backfill] Processing " 1026 + <> int.to_string(pds_count) 1027 + <> " PDS endpoints (max " 1028 + <> 
int.to_string(config.max_pds_workers) 1029 + <> " concurrent)...", 1030 + ) 1031 + 1032 + // Use sliding window to limit concurrent PDS workers 1033 + let subject = process.new_subject() 1034 + let #(initial_pds, remaining_pds) = 1035 + list.split(pds_entries, config.max_pds_workers) 1036 + let initial_count = list.length(initial_pds) 882 1037 883 - process.spawn_unlinked(fn() { 884 - pds_worker_car( 885 - pds_url, 886 - pds_repos, 887 - collections, 888 - config.max_concurrent_per_pds, 889 - conn, 890 - validation_ctx, 891 - subject, 892 - ) 1038 + // Spawn initial batch of PDS workers 1039 + list.each(initial_pds, fn(pds_entry) { 1040 + let #(pds_url, repo_pairs) = pds_entry 1041 + let pds_repos = 1042 + repo_pairs 1043 + |> list.map(fn(pair) { 1044 + let #(_pds, repo) = pair 1045 + repo 893 1046 }) 1047 + 1048 + process.spawn_unlinked(fn() { 1049 + pds_worker_car( 1050 + pds_url, 1051 + pds_repos, 1052 + collections, 1053 + config.max_concurrent_per_pds, 1054 + conn, 1055 + validation_ctx, 1056 + config.repo_fetch_timeout_ms, 1057 + subject, 1058 + ) 894 1059 }) 1060 + }) 895 1061 896 - // Collect counts from all PDS workers 897 - logging.log( 898 - logging.Info, 899 - "[backfill] Waiting for " <> string.inspect(pds_count) <> " PDS workers...", 900 - ) 1062 + // Process remaining with sliding window 901 1063 let result = 902 - list.range(1, pds_count) 903 - |> list.fold(0, fn(acc, i) { 904 - case process.receive(subject, 300_000) { 905 - Ok(count) -> { 906 - logging.log( 907 - logging.Info, 908 - "[backfill] PDS worker " 909 - <> string.inspect(i) 910 - <> "/" 911 - <> string.inspect(pds_count) 912 - <> " done (" 913 - <> string.inspect(count) 914 - <> " records)", 915 - ) 916 - acc + count 917 - } 918 - Error(_) -> { 919 - logging.log( 920 - logging.Warning, 921 - "[backfill] PDS worker " 922 - <> string.inspect(i) 923 - <> "/" 924 - <> string.inspect(pds_count) 925 - <> " timed out", 926 - ) 927 - acc 928 - } 929 - } 930 - }) 1064 + sliding_window_pds( 1065 + 
remaining_pds, 1066 + subject, 1067 + initial_count, 1068 + collections, 1069 + config.max_concurrent_per_pds, 1070 + conn, 1071 + validation_ctx, 1072 + config.repo_fetch_timeout_ms, 1073 + 0, 1074 + pds_count, 1075 + 0, 1076 + ) 1077 + 931 1078 logging.log( 932 1079 logging.Info, 933 1080 "[backfill] All PDS workers complete, total: " 934 - <> string.inspect(result) 1081 + <> int.to_string(result) 935 1082 <> " records", 936 1083 ) 937 1084 result
+12 -27
server/src/backfill_ffi.erl
··· 1 1 -module(backfill_ffi). 2 - -export([configure_pool/0, init_semaphore/0, acquire_permit/0, release_permit/0, rescue/1, monotonic_now/0, elapsed_ms/1]). 3 - 4 - %% Maximum concurrent HTTP requests for backfill 5 - -define(MAX_CONCURRENT, 150). 2 + -export([configure_pool/1, init_semaphore/1, acquire_permit/0, release_permit/0, rescue/1, monotonic_now/0, elapsed_ms/1]). 6 3 7 - %% Configure hackney connection pool with higher limits 8 - configure_pool() -> 4 + %% Configure hackney connection pool with specified limits 5 + configure_pool(MaxConcurrent) -> 9 6 %% Suppress SSL handshake error notices (TLS alerts from bad certificates) 10 - %% These clutter the logs when connecting to self-hosted PDS with bad certs 11 - %% Set both the ssl application log level and logger level 12 7 application:set_env(ssl, log_level, error), 13 8 logger:set_application_level(ssl, error), 14 9 15 10 %% Stop the default pool if it exists (ignore errors) 16 11 _ = hackney_pool:stop_pool(default), 17 12 18 - %% Start pool with increased connection limits and timeouts 19 - %% timeout: how long to keep connections alive in the pool (ms) 20 - %% max_connections: maximum number of connections in the pool 21 - %% recv_timeout: how long to wait for response data (ms) 13 + %% Start pool with configured connection limits 22 14 Options = [ 23 15 {timeout, 150000}, 24 - {max_connections, 300}, 16 + {max_connections, MaxConcurrent * 2}, 25 17 {recv_timeout, 30000} 26 18 ], 27 19 28 - %% Start the pool (this will create it if it doesn't exist) 29 20 case hackney_pool:start_pool(default, Options) of 30 21 ok -> ok; 31 22 {error, {already_started, _}} -> ok; ··· 33 24 end, 34 25 35 26 %% Initialize the semaphore for rate limiting 36 - init_semaphore(), 27 + init_semaphore(MaxConcurrent), 37 28 38 - %% Return nil (atom 'nil' in Gleam) 39 29 nil. 
40 30 41 31 %% Initialize the global semaphore using atomics 42 - %% Uses persistent_term for fast global access 43 - init_semaphore() -> 44 - case persistent_term:get(backfill_semaphore, undefined) of 45 - undefined -> 46 - Ref = atomics:new(1, [{signed, true}]), 47 - atomics:put(Ref, 1, ?MAX_CONCURRENT), 48 - persistent_term:put(backfill_semaphore, Ref); 49 - _ -> 50 - %% Already initialized 51 - ok 52 - end. 32 + init_semaphore(MaxConcurrent) -> 33 + %% Always recreate to pick up new limit 34 + Ref = atomics:new(1, [{signed, true}]), 35 + atomics:put(Ref, 1, MaxConcurrent), 36 + persistent_term:put(backfill_semaphore, Ref), 37 + ok. 53 38 54 39 %% Acquire a permit from the semaphore 55 40 %% Blocks (with sleep) if no permits available