tangled
alpha
login
or
join now
slices.network
/
slices
137
fork
atom
Highly ambitious ATProtocol AppView service and sdks
137
fork
atom
overview
issues
10
pulls
3
pipelines
improve sync performance to prevent ooms
chadtmiller.com
5 months ago
004b6609
7d49ce5b
+225
-74
1 changed file
expand all
collapse all
unified
split
api
src
sync.rs
+225
-74
api/src/sync.rs
···
2
//!
3
//! This module handles backfilling and syncing data from the ATProto network via the relay endpoint.
4
//! It provides:
5
-
//! - Backfilling collections for slices from the relay
6
-
//! - User-specific collection syncing for login flows
7
-
//! - Rate-limited PDS requests to avoid overwhelming servers
8
-
//! - DID resolution with caching
9
-
//! - Record validation against Lexicon schemas
10
-
//! - Actor indexing
0
11
12
use atproto_identity::resolve::{HickoryDnsResolver, resolve_subject};
13
use chrono::Utc;
0
14
use reqwest::Client;
15
use serde::{Deserialize, Serialize};
16
use serde_json::Value;
0
17
use tokio::time::{Duration, timeout};
18
use tracing::{debug, error, info, warn};
19
···
113
relay_endpoint: String,
114
cache: Arc<Mutex<SliceCache>>,
115
) -> Self {
0
0
0
0
0
0
0
0
0
0
116
Self {
117
-
client: Client::new(),
118
database,
119
relay_endpoint,
120
cache: Some(cache),
···
133
user_did: String,
134
cache: Arc<Mutex<SliceCache>>,
135
) -> Self {
0
0
0
0
0
0
0
0
0
0
136
Self {
137
-
client: Client::new(),
138
database,
139
relay_endpoint,
140
cache: Some(cache),
···
176
///
177
/// Tuple of (repos_processed, records_synced)
178
///
179
-
/// # Rate Limiting
180
///
181
-
/// Requests are grouped by PDS server and limited to 8 concurrent requests per PDS
182
-
/// to avoid overwhelming individual servers.
0
0
183
pub async fn backfill_collections(
184
&self,
185
slice_uri: &str,
···
299
300
info!("Starting sync for {} repositories...", valid_repos.len());
301
302
-
// Group requests by PDS server to implement rate limiting
0
303
let mut requests_by_pds: std::collections::HashMap<String, Vec<(String, String)>> =
304
-
std::collections::HashMap::new();
305
306
for repo in &valid_repos {
307
if let Some(atp_data) = atp_map.get(repo) {
···
321
requests_by_pds.values().map(|v| v.len()).sum::<usize>()
322
);
323
324
-
// Process each PDS server with limited concurrency to avoid overwhelming them
0
0
325
let mut fetch_tasks = Vec::new();
326
-
const MAX_CONCURRENT_PER_PDS: usize = 8;
327
328
for (_pds_url, repo_collections) in requests_by_pds {
329
let sync_service = self.clone();
···
399
}
400
401
// Collect all results
402
-
let mut all_records = Vec::new();
403
let mut successful_tasks = 0;
404
let mut failed_tasks = 0;
405
···
416
}
417
};
418
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
419
// Process results from each PDS server
0
0
420
for pds_task in fetch_tasks {
421
match pds_task.await {
422
Ok(pds_results) => {
···
438
// Validate each record if we have lexicons
439
else if let Some(ref lexicons) = lexicons {
440
let mut validation_errors = Vec::new();
441
-
for record in records {
442
-
match slices_lexicon::validate_record(
443
-
lexicons.clone(),
444
-
&collection,
445
-
record.json.clone(),
446
-
) {
447
-
Ok(_) => {
448
-
validated_records.push(record);
449
-
}
450
-
Err(e) => {
451
-
let error_msg = format!(
452
-
"Validation failed for record {} from {}: {}",
453
-
record.uri, repo, e
454
-
);
455
-
warn!("{}", error_msg);
456
-
validation_errors.push(json!({
457
-
"uri": record.uri,
458
-
"error": e.to_string()
459
-
}));
0
0
0
0
0
460
461
-
// Log individual validation failures
462
-
self.log_with_context(
463
-
slice_uri,
464
-
LogLevel::Warn,
465
-
&error_msg,
466
-
Some(json!({
467
-
"repo": repo,
468
-
"collection": collection,
469
-
"record_uri": record.uri,
470
-
"validation_error": e.to_string()
471
-
})),
472
-
);
0
473
}
474
}
475
}
···
525
);
526
}
527
528
-
all_records.extend(validated_records);
0
529
successful_tasks += 1;
530
}
531
Err(_) => {
···
539
failed_tasks += 1;
540
}
541
}
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
542
}
543
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
544
info!(
545
"Debug: {} successful tasks, {} failed tasks",
546
successful_tasks, failed_tasks
547
);
548
549
-
let total_records = all_records.len() as i64;
550
info!(
551
-
"Prepared {} new/changed records for indexing",
552
-
total_records
553
);
554
555
-
// Index actors first (ensuring actor records exist before inserting records)
556
-
info!("Indexing actors...");
557
-
self.index_actors(slice_uri, &valid_repos, &atp_map).await?;
558
-
info!("Indexed {} actors", valid_repos.len());
559
-
560
-
// Batch insert new/changed records
561
-
if !all_records.is_empty() {
562
-
info!("Indexing {} new/changed records...", total_records);
563
-
self.database.batch_insert_records(&all_records).await?;
564
-
} else {
565
-
info!("No new or changed records to index");
566
-
}
567
-
568
info!("Backfill complete!");
569
570
-
Ok((valid_repos.len() as i64, total_records))
571
}
572
573
/// Fetch all repositories that have records in a given collection.
···
697
///
698
/// Only returns new or changed records (compared by CID).
699
/// Uses cursor-based pagination to fetch all records.
0
0
0
0
0
700
async fn fetch_records_for_repo_collection(
701
&self,
702
repo: &str,
···
718
collection
719
);
720
721
-
let mut records = Vec::new();
0
0
722
let mut cursor: Option<String> = None;
723
let mut fetched_count = 0;
724
let mut skipped_count = 0;
···
785
}
786
787
// Record is new or changed, include it
0
788
let record = Record {
789
uri: atproto_record.uri,
790
cid: atproto_record.cid,
···
836
/// Resolve ATP data (DID, PDS, handle) for multiple repos.
837
///
838
/// Returns a map of DID -> AtpData. Failed resolutions are logged but don't fail the operation.
0
0
0
0
0
839
async fn get_atp_map_for_repos(
840
&self,
841
repos: &[String],
842
) -> Result<std::collections::HashMap<String, AtpData>, SyncError> {
843
-
let mut atp_map = std::collections::HashMap::new();
0
0
0
0
844
845
-
for repo in repos {
846
-
match self.resolve_atp_data(repo).await {
847
-
Ok(atp_data) => {
848
-
atp_map.insert(atp_data.did.clone(), atp_data);
849
-
}
850
-
Err(e) => {
851
-
warn!("Failed to resolve ATP data for {}: {:?}", repo, e);
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
852
}
0
0
0
0
0
0
0
0
0
0
0
0
0
0
853
}
854
}
855
0
856
Ok(atp_map)
857
}
858
···
2
//!
3
//! This module handles backfilling and syncing data from the ATProto network via the relay endpoint.
4
//! It provides:
5
+
//! - Memory-efficient batch processing with streaming writes
6
+
//! - Concurrent database operations using channel-based architecture
7
+
//! - HTTP/2 connection pooling for optimal network utilization
8
+
//! - Rate-limited PDS requests (3 concurrent per server)
9
+
//! - DID resolution with caching and chunked processing
10
+
//! - Parallel record validation against Lexicon schemas
11
+
//! - Actor indexing with pre-allocated data structures
12
13
use atproto_identity::resolve::{HickoryDnsResolver, resolve_subject};
14
use chrono::Utc;
15
+
use futures_util::future;
16
use reqwest::Client;
17
use serde::{Deserialize, Serialize};
18
use serde_json::Value;
19
+
use tokio::sync::mpsc;
20
use tokio::time::{Duration, timeout};
21
use tracing::{debug, error, info, warn};
22
···
116
relay_endpoint: String,
117
cache: Arc<Mutex<SliceCache>>,
118
) -> Self {
119
+
// Create HTTP client with connection pooling and optimized settings
120
+
let client = Client::builder()
121
+
.pool_idle_timeout(Duration::from_secs(90))
122
+
.pool_max_idle_per_host(10)
123
+
.http2_keep_alive_interval(Some(Duration::from_secs(30)))
124
+
.http2_keep_alive_timeout(Duration::from_secs(10))
125
+
.timeout(Duration::from_secs(30))
126
+
.build()
127
+
.unwrap_or_else(|_| Client::new());
128
+
129
Self {
130
+
client,
131
database,
132
relay_endpoint,
133
cache: Some(cache),
···
146
user_did: String,
147
cache: Arc<Mutex<SliceCache>>,
148
) -> Self {
149
+
// Create HTTP client with connection pooling and optimized settings
150
+
let client = Client::builder()
151
+
.pool_idle_timeout(Duration::from_secs(90))
152
+
.pool_max_idle_per_host(10)
153
+
.http2_keep_alive_interval(Some(Duration::from_secs(30)))
154
+
.http2_keep_alive_timeout(Duration::from_secs(10))
155
+
.timeout(Duration::from_secs(30))
156
+
.build()
157
+
.unwrap_or_else(|_| Client::new());
158
+
159
Self {
160
+
client,
161
database,
162
relay_endpoint,
163
cache: Some(cache),
···
199
///
200
/// Tuple of (repos_processed, records_synced)
201
///
202
+
/// # Performance Optimizations
203
///
204
+
/// - Requests are grouped by PDS server with 3 concurrent requests max
205
+
/// - Records are processed in 500-item batches to limit memory usage
206
+
/// - Database writes happen concurrently via channels
207
+
/// - HTTP/2 connection pooling reduces network overhead
208
pub async fn backfill_collections(
209
&self,
210
slice_uri: &str,
···
324
325
info!("Starting sync for {} repositories...", valid_repos.len());
326
327
+
// Group requests by PDS server for rate limiting and connection reuse
328
+
// Pre-allocated capacity avoids HashMap resizing during insertions
329
let mut requests_by_pds: std::collections::HashMap<String, Vec<(String, String)>> =
330
+
std::collections::HashMap::with_capacity(atp_map.len());
331
332
for repo in &valid_repos {
333
if let Some(atp_data) = atp_map.get(repo) {
···
347
requests_by_pds.values().map(|v| v.len()).sum::<usize>()
348
);
349
350
+
// Process each PDS server with limited concurrency
351
+
// 3 concurrent requests balances speed vs memory usage
352
+
// Lower than 3 = too slow, Higher than 3 = memory pressure
353
let mut fetch_tasks = Vec::new();
354
+
const MAX_CONCURRENT_PER_PDS: usize = 3;
355
356
for (_pds_url, repo_collections) in requests_by_pds {
357
let sync_service = self.clone();
···
427
}
428
429
// Collect all results
0
430
let mut successful_tasks = 0;
431
let mut failed_tasks = 0;
432
···
443
}
444
};
445
446
+
// Index actors first (ensuring actor records exist before inserting records)
447
+
info!("Indexing actors...");
448
+
self.index_actors(slice_uri, &valid_repos, &atp_map).await?;
449
+
info!("Indexed {} actors", valid_repos.len());
450
+
451
+
// Set up concurrent database writer using channels
452
+
// This allows fetching to continue while DB writes happen in parallel
453
+
// 500-record batches optimize for memory usage and DB transaction size
454
+
const BATCH_SIZE: usize = 500;
455
+
let (tx, mut rx) = mpsc::channel::<Vec<Record>>(4); // Buffer prevents backpressure
456
+
let database = self.database.clone();
457
+
let total_indexed_records = Arc::new(Mutex::new(0i64));
458
+
459
+
// Spawn database writer task
460
+
let writer_task = tokio::spawn(async move {
461
+
let mut write_count = 0i64;
462
+
while let Some(batch) = rx.recv().await {
463
+
let batch_size = batch.len() as i64;
464
+
match database.batch_insert_records(&batch).await {
465
+
Ok(_) => {
466
+
write_count += batch_size;
467
+
info!("Database writer: Inserted batch of {} records (total: {})", batch_size, write_count);
468
+
}
469
+
Err(e) => {
470
+
error!("Database writer: Failed to insert batch: {}", e);
471
+
return Err(SyncError::Generic(format!("Failed to insert batch: {}", e)));
472
+
}
473
+
}
474
+
}
475
+
Ok(write_count)
476
+
});
477
+
478
// Process results from each PDS server
479
+
let mut batch_buffer = Vec::with_capacity(BATCH_SIZE);
480
+
481
for pds_task in fetch_tasks {
482
match pds_task.await {
483
Ok(pds_results) => {
···
499
// Validate each record if we have lexicons
500
else if let Some(ref lexicons) = lexicons {
501
let mut validation_errors = Vec::new();
502
+
503
+
// Process validations in chunks for better CPU cache locality
504
+
// 50 records per chunk optimizes L2/L3 cache usage
505
+
const VALIDATION_CHUNK_SIZE: usize = 50;
506
+
for chunk in records.chunks(VALIDATION_CHUNK_SIZE) {
507
+
for record in chunk {
508
+
match slices_lexicon::validate_record(
509
+
lexicons.clone(),
510
+
&collection,
511
+
record.json.clone(),
512
+
) {
513
+
Ok(_) => {
514
+
validated_records.push(record.clone());
515
+
}
516
+
Err(e) => {
517
+
let error_msg = format!(
518
+
"Validation failed for record {} from {}: {}",
519
+
record.uri, repo, e
520
+
);
521
+
warn!("{}", error_msg);
522
+
validation_errors.push(json!({
523
+
"uri": record.uri,
524
+
"error": e.to_string()
525
+
}));
526
527
+
// Log individual validation failures
528
+
self.log_with_context(
529
+
slice_uri,
530
+
LogLevel::Warn,
531
+
&error_msg,
532
+
Some(json!({
533
+
"repo": repo,
534
+
"collection": collection,
535
+
"record_uri": record.uri,
536
+
"validation_error": e.to_string()
537
+
})),
538
+
);
539
+
}
540
}
541
}
542
}
···
592
);
593
}
594
595
+
// Add to batch buffer instead of all_records
596
+
batch_buffer.extend(validated_records);
597
successful_tasks += 1;
598
}
599
Err(_) => {
···
607
failed_tasks += 1;
608
}
609
}
610
+
611
+
// Send batch to writer when buffer is full
612
+
if batch_buffer.len() >= BATCH_SIZE {
613
+
let batch_to_send = std::mem::replace(&mut batch_buffer, Vec::with_capacity(BATCH_SIZE));
614
+
let batch_count = batch_to_send.len() as i64;
615
+
info!("Sending batch of {} records to database writer", batch_count);
616
+
617
+
// Send to writer channel (non-blocking)
618
+
if let Err(e) = tx.send(batch_to_send).await {
619
+
error!("Failed to send batch to writer: {}", e);
620
+
return Err(SyncError::Generic(format!("Failed to send batch to writer: {}", e)));
621
+
}
622
+
623
+
let mut total = total_indexed_records.lock().await;
624
+
*total += batch_count;
625
+
}
626
}
627
628
+
// Flush any remaining records in the buffer
629
+
if !batch_buffer.is_empty() {
630
+
let batch_count = batch_buffer.len() as i64;
631
+
info!("Sending final batch of {} records to database writer", batch_count);
632
+
633
+
if let Err(e) = tx.send(batch_buffer).await {
634
+
error!("Failed to send final batch to writer: {}", e);
635
+
return Err(SyncError::Generic(format!("Failed to send final batch to writer: {}", e)));
636
+
}
637
+
638
+
let mut total = total_indexed_records.lock().await;
639
+
*total += batch_count;
640
+
}
641
+
642
+
// Close the channel and wait for writer to finish
643
+
drop(tx);
644
+
let write_result = writer_task.await
645
+
.map_err(|e| SyncError::Generic(format!("Writer task panicked: {}", e)))?;
646
+
647
+
let final_count = match write_result {
648
+
Ok(count) => count,
649
+
Err(e) => return Err(e),
650
+
};
651
+
652
info!(
653
"Debug: {} successful tasks, {} failed tasks",
654
successful_tasks, failed_tasks
655
);
656
0
657
info!(
658
+
"Indexed {} new/changed records in batches",
659
+
final_count
660
);
661
0
0
0
0
0
0
0
0
0
0
0
0
0
662
info!("Backfill complete!");
663
664
+
Ok((valid_repos.len() as i64, final_count))
665
}
666
667
/// Fetch all repositories that have records in a given collection.
···
791
///
792
/// Only returns new or changed records (compared by CID).
793
/// Uses cursor-based pagination to fetch all records.
794
+
///
795
+
/// # Memory optimizations:
796
+
/// - Pre-allocated Vec with 100 capacity (typical collection size)
797
+
/// - Fetches in 100-record pages to limit response size
798
+
/// - Reuses HTTP connections via client pooling
799
async fn fetch_records_for_repo_collection(
800
&self,
801
repo: &str,
···
817
collection
818
);
819
820
+
// Pre-allocate based on typical collection size (100 records)
821
+
// This avoids Vec reallocations which can cause memory fragmentation
822
+
let mut records = Vec::with_capacity(100);
823
let mut cursor: Option<String> = None;
824
let mut fetched_count = 0;
825
let mut skipped_count = 0;
···
886
}
887
888
// Record is new or changed, include it
889
+
// TODO: Consider using Arc<str> for frequently cloned strings
890
let record = Record {
891
uri: atproto_record.uri,
892
cid: atproto_record.cid,
···
938
/// Resolve ATP data (DID, PDS, handle) for multiple repos.
939
///
940
/// Returns a map of DID -> AtpData. Failed resolutions are logged but don't fail the operation.
941
+
///
942
+
/// # Performance optimizations:
943
+
/// - Processes DIDs in 50-item chunks to limit memory usage
944
+
/// - 10 concurrent DNS resolutions max to avoid resolver exhaustion
945
+
/// - Pre-allocated HashMap based on input size
946
async fn get_atp_map_for_repos(
947
&self,
948
repos: &[String],
949
) -> Result<std::collections::HashMap<String, AtpData>, SyncError> {
950
+
let mut atp_map = std::collections::HashMap::with_capacity(repos.len());
951
+
const CHUNK_SIZE: usize = 50; // Process DIDs in chunks
952
+
const MAX_CONCURRENT: usize = 10; // Limit concurrent resolutions
953
+
954
+
info!("Resolving ATP data for {} repositories in chunks", repos.len());
955
956
+
for (chunk_idx, chunk) in repos.chunks(CHUNK_SIZE).enumerate() {
957
+
let chunk_start = chunk_idx * CHUNK_SIZE;
958
+
let chunk_end = std::cmp::min(chunk_start + CHUNK_SIZE, repos.len());
959
+
960
+
debug!(
961
+
"Processing DID resolution chunk {}/{} (repos {}-{})",
962
+
chunk_idx + 1,
963
+
repos.len().div_ceil(CHUNK_SIZE),
964
+
chunk_start,
965
+
chunk_end - 1
966
+
);
967
+
968
+
// Process this chunk with limited concurrency
969
+
let mut resolution_tasks = Vec::new();
970
+
971
+
for batch in chunk.chunks(MAX_CONCURRENT) {
972
+
let mut batch_futures = Vec::new();
973
+
974
+
for repo in batch {
975
+
let repo_clone = repo.clone();
976
+
let self_clone = self.clone();
977
+
978
+
let fut = async move {
979
+
match self_clone.resolve_atp_data(&repo_clone).await {
980
+
Ok(atp_data) => Some((atp_data.did.clone(), atp_data)),
981
+
Err(e) => {
982
+
warn!("Failed to resolve ATP data for {}: {:?}", repo_clone, e);
983
+
None
984
+
}
985
+
}
986
+
};
987
+
batch_futures.push(fut);
988
}
989
+
990
+
// Wait for this batch to complete
991
+
let batch_results = future::join_all(batch_futures).await;
992
+
resolution_tasks.extend(batch_results);
993
+
}
994
+
995
+
// Add resolved data to map
996
+
for (did, atp_data) in resolution_tasks.into_iter().flatten() {
997
+
atp_map.insert(did, atp_data);
998
+
}
999
+
1000
+
// Small delay between chunks to be kind to DNS resolvers
1001
+
if chunk_idx < repos.len().div_ceil(CHUNK_SIZE) - 1 {
1002
+
tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;
1003
}
1004
}
1005
1006
+
info!("Successfully resolved ATP data for {}/{} repositories", atp_map.len(), repos.len());
1007
Ok(atp_map)
1008
}
1009