···5050 plc_directory_url: String,
5151 index_actors: Bool,
5252 max_concurrent_per_pds: Int,
5353+ max_pds_workers: Int,
5454+ max_http_concurrent: Int,
5555+ repo_fetch_timeout_ms: Int,
5356 did_cache: Option(Subject(did_cache.Message)),
5457 )
5558}
···6265 Error(_) -> "https://plc.directory"
6366 }
64676565- // Get max concurrent per PDS from environment or use default of 6
6868+ // Get max concurrent per PDS from environment or use default of 4
6669 let max_pds_concurrent = case envoy.get("BACKFILL_PDS_CONCURRENCY") {
6770 Ok(val) -> {
6871 case int.parse(val) {
6972 Ok(n) -> n
7070- Error(_) -> 6
7373+ Error(_) -> 4
7474+ }
7575+ }
7676+ Error(_) -> 4
7777+ }
7878+7979+ // Get max PDS workers from environment or use default of 10
8080+ let max_pds_workers = case envoy.get("BACKFILL_MAX_PDS_WORKERS") {
8181+ Ok(val) -> {
8282+ case int.parse(val) {
8383+ Ok(n) -> n
8484+ Error(_) -> 10
8585+ }
8686+ }
8787+ Error(_) -> 10
8888+ }
8989+9090+ // Get max HTTP concurrent from environment or use default of 50
9191+ let max_http = case envoy.get("BACKFILL_MAX_HTTP_CONCURRENT") {
9292+ Ok(val) -> {
9393+ case int.parse(val) {
9494+ Ok(n) -> n
9595+ Error(_) -> 50
7196 }
7297 }
7373- Error(_) -> 6
9898+ Error(_) -> 50
9999+ }
100100+101101+ // Get repo fetch timeout from environment or use default of 60s
102102+ let repo_timeout = case envoy.get("BACKFILL_REPO_TIMEOUT") {
103103+ Ok(val) -> {
104104+ case int.parse(val) {
105105+ Ok(n) -> n * 1000
106106+ Error(_) -> 60_000
107107+ }
108108+ }
109109+ Error(_) -> 60_000
74110 }
751117676- // Configure hackney pool for better connection reuse
7777- // We'll call directly into Erlang to set up the pool
7878- configure_hackney_pool()
112112+ // Configure hackney pool with the configured HTTP limit
113113+ configure_hackney_pool(max_http)
7911480115 BackfillConfig(
81116 plc_directory_url: plc_url,
82117 index_actors: True,
83118 max_concurrent_per_pds: max_pds_concurrent,
119119+ max_pds_workers: max_pds_workers,
120120+ max_http_concurrent: max_http,
121121+ repo_fetch_timeout_ms: repo_timeout,
84122 did_cache: None,
85123 )
86124}
···91129 BackfillConfig(..config, did_cache: Some(cache))
92130}
931319494-/// Configure hackney connection pool with higher limits
9595-/// Called via Erlang FFI to avoid atom conversion issues
132132+/// Configure hackney connection pool with specified limits
96133@external(erlang, "backfill_ffi", "configure_pool")
9797-fn configure_hackney_pool() -> Nil
134134+fn configure_hackney_pool(max_concurrent: Int) -> Nil
9813599136/// Acquire a permit from the global HTTP semaphore
100137/// Blocks if at the configured concurrent request limit (BACKFILL_MAX_HTTP_CONCURRENT, default 50)
···697734 max_concurrent: Int,
698735 conn: sqlight.Connection,
699736 validation_ctx: Option(honk.ValidationContext),
737737+ timeout_ms: Int,
700738 reply_to: Subject(Int),
701739) -> Nil {
702740 logging.log(
···704742 "[backfill] PDS worker starting for "
705743 <> pds_url
706744 <> " with "
707707- <> string.inspect(list.length(repos))
745745+ <> int.to_string(list.length(repos))
708746 <> " repos",
709747 )
710748 let subject = process.new_subject()
···737775 collections,
738776 conn,
739777 validation_ctx,
778778+ timeout_ms,
740779 0,
741780 )
742781···745784 "[backfill] PDS worker finished for "
746785 <> pds_url
747786 <> " with "
748748- <> string.inspect(total_count)
787787+ <> int.to_string(total_count)
749788 <> " total records",
750789 )
751790 process.send(reply_to, total_count)
···760799 collections: List(String),
761800 conn: sqlight.Connection,
762801 validation_ctx: Option(honk.ValidationContext),
802802+ timeout_ms: Int,
763803 total: Int,
764804) -> Int {
765805 case in_flight {
766806 0 -> total
767807 _ -> {
768768- // 5 minute timeout per CAR worker (validation adds processing time for large repos)
769769- case process.receive(subject, 300_000) {
808808+ case process.receive(subject, timeout_ms) {
770809 Ok(count) -> {
771810 let new_total = total + count
772811 case remaining {
···789828 collections,
790829 conn,
791830 validation_ctx,
831831+ timeout_ms,
792832 new_total,
793833 )
794834 }
···801841 collections,
802842 conn,
803843 validation_ctx,
844844+ timeout_ms,
804845 new_total,
805846 )
806847 }
···811852 "[backfill] Timeout waiting for CAR worker on "
812853 <> pds_url
813854 <> " (in_flight: "
814814- <> string.inspect(in_flight)
855855+ <> int.to_string(in_flight)
815856 <> ", remaining: "
816816- <> string.inspect(list.length(remaining))
857857+ <> int.to_string(list.length(remaining))
817858 <> ")",
818859 )
819860 sliding_window_car(
···824865 collections,
825866 conn,
826867 validation_ctx,
868868+ timeout_ms,
827869 total,
828870 )
829871 }
···832874 }
833875}
/// Sliding window for PDS worker processing.
/// Limits how many PDS endpoints are processed concurrently.
///
/// Waits on `subject` for one in-flight PDS worker at a time. Whenever a
/// slot frees up, the next queued entry from `remaining` is started, so the
/// number of awaited workers never grows beyond the initial `in_flight`.
///
/// - `remaining`: queued `#(pds_url, repo_pairs)` entries not yet started;
///   the second element of every pair is what gets handed to the worker
/// - `subject`: where each `pds_worker_car` sends its record count
/// - `in_flight`: number of workers currently being awaited
/// - `collections`, `max_concurrent_per_pds`, `conn`, `validation_ctx`,
///   `timeout_ms`: forwarded unchanged to every `pds_worker_car`
/// - `total`: record count accumulated so far
/// - `pds_count`, `completed`: used only for the progress log line
///
/// Returns the accumulated record count once no workers are in flight.
fn sliding_window_pds(
  remaining: List(#(String, List(#(String, String)))),
  subject: Subject(Int),
  in_flight: Int,
  collections: List(String),
  max_concurrent_per_pds: Int,
  conn: sqlight.Connection,
  validation_ctx: Option(honk.ValidationContext),
  timeout_ms: Int,
  total: Int,
  pds_count: Int,
  completed: Int,
) -> Int {
  case in_flight {
    0 -> total
    _ -> {
      // 5 minute timeout per PDS worker
      let #(new_total, new_completed) = case
        process.receive(subject, 300_000)
      {
        Ok(count) -> {
          let done = completed + 1
          logging.log(
            logging.Info,
            "[backfill] PDS worker "
              <> int.to_string(done)
              <> "/"
              <> int.to_string(pds_count)
              <> " done ("
              <> int.to_string(count)
              <> " records)",
          )
          #(total + count, done)
        }
        Error(_) -> {
          logging.log(
            logging.Warning,
            "[backfill] PDS worker timed out (in_flight: "
              <> int.to_string(in_flight)
              <> ", remaining: "
              <> int.to_string(list.length(remaining))
              <> ")",
          )
          // A timed-out worker contributes nothing to the totals
          #(total, completed)
        }
      }

      // The awaited slot is free now, whether the worker finished or timed
      // out. Refill it from the queue in both cases; only shrink the window
      // once the queue is empty. Shrinking on timeout while entries are still
      // queued would let in_flight reach 0 and drop those entries unprocessed.
      case remaining {
        [#(pds_url, repo_pairs), ..rest] -> {
          let pds_repos = list.map(repo_pairs, fn(pair) { pair.1 })
          process.spawn_unlinked(fn() {
            pds_worker_car(
              pds_url,
              pds_repos,
              collections,
              max_concurrent_per_pds,
              conn,
              validation_ctx,
              timeout_ms,
              subject,
            )
          })
          sliding_window_pds(
            rest,
            subject,
            in_flight,
            collections,
            max_concurrent_per_pds,
            conn,
            validation_ctx,
            timeout_ms,
            new_total,
            pds_count,
            new_completed,
          )
        }
        [] ->
          sliding_window_pds(
            [],
            subject,
            in_flight - 1,
            collections,
            max_concurrent_per_pds,
            conn,
            validation_ctx,
            timeout_ms,
            new_total,
            pds_count,
            new_completed,
          )
      }
    }
  }
}
987987+835988/// CAR-based streaming: fetch repos as CAR files and filter locally
836989/// One request per repo instead of one per (repo, collection)
837990pub fn get_records_for_repos_car(
···8641017 pds
8651018 })
8661019867867- // Spawn one worker per PDS
868868- let subject = process.new_subject()
8691020 let pds_entries = dict.to_list(repos_by_pds)
8701021 let pds_count = list.length(pds_entries)
8711022872872- let _pds_workers =
873873- pds_entries
874874- |> list.map(fn(pds_entry) {
875875- let #(pds_url, repo_pairs) = pds_entry
876876- let pds_repos =
877877- repo_pairs
878878- |> list.map(fn(pair) {
879879- let #(_pds, repo) = pair
880880- repo
881881- })
10231023+ logging.log(
10241024+ logging.Info,
10251025+ "[backfill] Processing "
10261026+ <> int.to_string(pds_count)
10271027+ <> " PDS endpoints (max "
10281028+ <> int.to_string(config.max_pds_workers)
10291029+ <> " concurrent)...",
10301030+ )
10311031+10321032+ // Use sliding window to limit concurrent PDS workers
10331033+ let subject = process.new_subject()
10341034+ let #(initial_pds, remaining_pds) =
10351035+ list.split(pds_entries, config.max_pds_workers)
10361036+ let initial_count = list.length(initial_pds)
8821037883883- process.spawn_unlinked(fn() {
884884- pds_worker_car(
885885- pds_url,
886886- pds_repos,
887887- collections,
888888- config.max_concurrent_per_pds,
889889- conn,
890890- validation_ctx,
891891- subject,
892892- )
10381038+ // Spawn initial batch of PDS workers
10391039+ list.each(initial_pds, fn(pds_entry) {
10401040+ let #(pds_url, repo_pairs) = pds_entry
10411041+ let pds_repos =
10421042+ repo_pairs
10431043+ |> list.map(fn(pair) {
10441044+ let #(_pds, repo) = pair
10451045+ repo
8931046 })
10471047+10481048+ process.spawn_unlinked(fn() {
10491049+ pds_worker_car(
10501050+ pds_url,
10511051+ pds_repos,
10521052+ collections,
10531053+ config.max_concurrent_per_pds,
10541054+ conn,
10551055+ validation_ctx,
10561056+ config.repo_fetch_timeout_ms,
10571057+ subject,
10581058+ )
8941059 })
10601060+ })
8951061896896- // Collect counts from all PDS workers
897897- logging.log(
898898- logging.Info,
899899- "[backfill] Waiting for " <> string.inspect(pds_count) <> " PDS workers...",
900900- )
10621062+ // Process remaining with sliding window
9011063 let result =
902902- list.range(1, pds_count)
903903- |> list.fold(0, fn(acc, i) {
904904- case process.receive(subject, 300_000) {
905905- Ok(count) -> {
906906- logging.log(
907907- logging.Info,
908908- "[backfill] PDS worker "
909909- <> string.inspect(i)
910910- <> "/"
911911- <> string.inspect(pds_count)
912912- <> " done ("
913913- <> string.inspect(count)
914914- <> " records)",
915915- )
916916- acc + count
917917- }
918918- Error(_) -> {
919919- logging.log(
920920- logging.Warning,
921921- "[backfill] PDS worker "
922922- <> string.inspect(i)
923923- <> "/"
924924- <> string.inspect(pds_count)
925925- <> " timed out",
926926- )
927927- acc
928928- }
929929- }
930930- })
10641064+ sliding_window_pds(
10651065+ remaining_pds,
10661066+ subject,
10671067+ initial_count,
10681068+ collections,
10691069+ config.max_concurrent_per_pds,
10701070+ conn,
10711071+ validation_ctx,
10721072+ config.repo_fetch_timeout_ms,
10731073+ 0,
10741074+ pds_count,
10751075+ 0,
10761076+ )
10771077+9311078 logging.log(
9321079 logging.Info,
9331080 "[backfill] All PDS workers complete, total: "
934934- <> string.inspect(result)
10811081+ <> int.to_string(result)
9351082 <> " records",
9361083 )
9371084 result
+12-27
server/src/backfill_ffi.erl
···11-module(backfill_ffi).
22--export([configure_pool/0, init_semaphore/0, acquire_permit/0, release_permit/0, rescue/1, monotonic_now/0, elapsed_ms/1]).
33-44-%% Maximum concurrent HTTP requests for backfill
55--define(MAX_CONCURRENT, 150).
22+-export([configure_pool/1, init_semaphore/1, acquire_permit/0, release_permit/0, rescue/1, monotonic_now/0, elapsed_ms/1]).
6377-%% Configure hackney connection pool with higher limits
88-configure_pool() ->
44+%% Configure hackney connection pool with specified limits
55+configure_pool(MaxConcurrent) ->
96 %% Suppress SSL handshake error notices (TLS alerts from bad certificates)
1010- %% These clutter the logs when connecting to self-hosted PDS with bad certs
1111- %% Set both the ssl application log level and logger level
127 application:set_env(ssl, log_level, error),
138 logger:set_application_level(ssl, error),
1491510 %% Stop the default pool if it exists (ignore errors)
1611 _ = hackney_pool:stop_pool(default),
17121818- %% Start pool with increased connection limits and timeouts
1919- %% timeout: how long to keep connections alive in the pool (ms)
2020- %% max_connections: maximum number of connections in the pool
2121- %% recv_timeout: how long to wait for response data (ms)
1313+ %% Start pool with configured connection limits
2214 Options = [
2315 {timeout, 150000},
2424- {max_connections, 300},
1616+ {max_connections, MaxConcurrent * 2},
2517 {recv_timeout, 30000}
2618 ],
27192828- %% Start the pool (this will create it if it doesn't exist)
2920 case hackney_pool:start_pool(default, Options) of
3021 ok -> ok;
3122 {error, {already_started, _}} -> ok;
···3324 end,
34253526 %% Initialize the semaphore for rate limiting
3636- init_semaphore(),
2727+ init_semaphore(MaxConcurrent),
37283838- %% Return nil (atom 'nil' in Gleam)
3929 nil.
40304131%% Initialize the global semaphore using atomics
4242-%% Uses persistent_term for fast global access
4343-init_semaphore() ->
4444- case persistent_term:get(backfill_semaphore, undefined) of
4545- undefined ->
4646- Ref = atomics:new(1, [{signed, true}]),
4747- atomics:put(Ref, 1, ?MAX_CONCURRENT),
4848- persistent_term:put(backfill_semaphore, Ref);
4949- _ ->
5050- %% Already initialized
5151- ok
5252- end.
init_semaphore(MaxConcurrent) ->
    %% Always recreate to pick up new limit.
    %% The limit originates from environment configuration, so clamp it to at
    %% least one permit: a counter starting at 0 or below would never hand out
    %% a permit, and acquire_permit/0 is documented to block while none are
    %% available.
    Permits = max(1, MaxConcurrent),
    %% Signed counter so the available-permit count may be compared against 0.
    Ref = atomics:new(1, [{signed, true}]),
    ok = atomics:put(Ref, 1, Permits),
    %% Publish the counter under a global key for the permit functions.
    persistent_term:put(backfill_semaphore, Ref),
    ok.
53385439%% Acquire a permit from the semaphore
5540%% Blocks (with sleep) if no permits available