tangled
alpha
login
or
join now
trezy.codes
/
quickslice
forked from
slices.network/quickslice
0
fork
atom
Auto-indexing service and GraphQL API for AT Protocol Records
0
fork
atom
overview
issues
pulls
pipelines
skip record insert if cid exists
chadtmiller.com
4 months ago
9f059c3b
36febe33
+241
-65
1 changed file
expand all
collapse all
unified
split
server
src
database.gleam
+241
-65
server/src/database.gleam
···
1
1
import cursor
2
2
+
import gleam/dict.{type Dict}
2
3
import gleam/dynamic/decode
3
4
import gleam/int
4
5
import gleam/list
···
253
254
sqlight.exec(create_table_sql, conn)
254
255
}
255
256
257
257
+
/// Migration v3: index `record.cid` so CID lookups used for deduplication
/// do not have to scan the whole table.
///
/// Logs that the migration is starting, then executes the index-creation
/// statement on `conn` and returns whatever `sqlight.exec` returns.
fn migration_v3(conn: sqlight.Connection) -> Result(Nil, sqlight.Error) {
  logging.log(logging.Info, "Running migration v3 (CID index)...")

  // IF NOT EXISTS makes the statement a no-op when the index is already there.
  "
    CREATE INDEX IF NOT EXISTS idx_record_cid
    ON record(cid)
  "
  |> sqlight.exec(conn)
}
269
269
+
256
270
/// Runs all pending migrations based on current schema version
257
271
fn run_migrations(conn: sqlight.Connection) -> Result(Nil, sqlight.Error) {
258
272
use current_version <- result.try(get_current_version(conn))
···
267
281
// Fresh database or pre-migration database - run v1
268
282
0 -> {
269
283
use _ <- result.try(apply_migration(conn, 1, migration_v1))
270
270
-
apply_migration(conn, 2, migration_v2)
284
284
+
use _ <- result.try(apply_migration(conn, 2, migration_v2))
285
285
+
apply_migration(conn, 3, migration_v3)
271
286
}
272
287
273
273
-
// Run v2 migration
274
274
-
1 -> apply_migration(conn, 2, migration_v2)
288
288
+
// Run v2 and v3 migrations
289
289
+
1 -> {
290
290
+
use _ <- result.try(apply_migration(conn, 2, migration_v2))
291
291
+
apply_migration(conn, 3, migration_v3)
292
292
+
}
293
293
+
294
294
+
// Run v3 migration
295
295
+
2 -> apply_migration(conn, 3, migration_v3)
275
296
276
297
// Already at latest version
277
277
-
2 -> {
278
278
-
logging.log(logging.Info, "Schema is up to date (v2)")
298
298
+
3 -> {
299
299
+
logging.log(logging.Info, "Schema is up to date (v3)")
279
300
Ok(Nil)
280
301
}
281
302
282
303
// Future versions would be handled here:
283
283
-
// 2 -> apply_migration(conn, 3, migration_v3)
284
304
// 3 -> apply_migration(conn, 4, migration_v4)
305
305
+
// 4 -> apply_migration(conn, 5, migration_v5)
285
306
_ -> {
286
307
logging.log(
287
308
logging.Error,
···
455
476
456
477
// ===== Record Functions =====
457
478
479
479
+
/// Gets existing CIDs for a list of URIs
/// Returns a Dict mapping URI -> CID for records that exist in the database.
/// URIs that have no row in `record` are simply absent from the result.
///
/// The lookup is issued in chunks so that no single statement binds more
/// parameters than SQLite allows, however many URIs the caller passes.
/// Returns the first `sqlight.Error` encountered, if any chunk fails.
fn get_existing_cids(
  conn: sqlight.Connection,
  uris: List(String),
) -> Result(Dict(String, String), sqlight.Error) {
  // One bound parameter per URI; stay well under SQLite's default limit of
  // 999 parameters per statement.
  let chunk_size = 500

  // Each row is (uri, cid).
  let decoder = {
    use uri <- decode.field(0, decode.string)
    use cid <- decode.field(1, decode.string)
    decode.success(#(uri, cid))
  }

  // An empty `uris` produces no chunks, so no query runs and the fold
  // returns the empty Dict it started with.
  list.sized_chunk(uris, chunk_size)
  |> list.try_fold(dict.new(), fn(found, chunk) {
    // Build placeholders for the SQL IN clause, one per URI in this chunk
    let placeholders =
      list.map(chunk, fn(_) { "?" })
      |> string.join(", ")

    let sql = "
      SELECT uri, cid
      FROM record
      WHERE uri IN (" <> placeholders <> ")
    "

    use rows <- result.try(sqlight.query(
      sql,
      on: conn,
      with: list.map(chunk, sqlight.text),
      expecting: decoder,
    ))

    // Fold this chunk's rows into the accumulated URI -> CID mapping
    Ok(dict.merge(found, dict.from_list(rows)))
  })
}
521
521
+
458
522
/// Inserts or updates a record in the database
523
523
+
/// Skips insertion if the CID already exists in the database (for any URI)
524
524
+
/// Also skips update if the URI exists with the same CID (content unchanged)
459
525
pub fn insert_record(
460
526
conn: sqlight.Connection,
461
527
uri: String,
···
464
530
collection: String,
465
531
json: String,
466
532
) -> Result(Nil, sqlight.Error) {
467
467
-
let sql =
468
468
-
"
469
469
-
INSERT INTO record (uri, cid, did, collection, json)
470
470
-
VALUES (?, ?, ?, ?, ?)
471
471
-
ON CONFLICT(uri) DO UPDATE SET
472
472
-
cid = excluded.cid,
473
473
-
json = excluded.json,
474
474
-
indexed_at = datetime('now')
475
475
-
"
533
533
+
// Look up the CID currently stored for this URI, if any
534
534
+
use existing_cids <- result.try(get_existing_cids(conn, [uri]))
476
535
477
477
-
use _ <- result.try(sqlight.query(
478
478
-
sql,
479
479
-
on: conn,
480
480
-
with: [
481
481
-
sqlight.text(uri),
482
482
-
sqlight.text(cid),
483
483
-
sqlight.text(did),
484
484
-
sqlight.text(collection),
485
485
-
sqlight.text(json),
486
486
-
],
487
487
-
expecting: decode.string,
488
488
-
))
489
489
-
Ok(Nil)
536
536
+
case dict.get(existing_cids, uri) {
537
537
+
// URI exists with same CID - skip update (content unchanged)
538
538
+
Ok(existing_cid) if existing_cid == cid -> Ok(Nil)
539
539
+
// URI exists with different CID - proceed with update
540
540
+
// URI doesn't exist - proceed with insert
541
541
+
_ -> {
542
542
+
// Check if this CID exists for any other URI
543
543
+
let check_cid_sql =
544
544
+
"
545
545
+
SELECT COUNT(*) as count
546
546
+
FROM record
547
547
+
WHERE cid = ?
548
548
+
"
549
549
+
550
550
+
let count_decoder = {
551
551
+
use count <- decode.field(0, decode.int)
552
552
+
decode.success(count)
553
553
+
}
554
554
+
555
555
+
use cid_exists <- result.try(case
556
556
+
sqlight.query(
557
557
+
check_cid_sql,
558
558
+
on: conn,
559
559
+
with: [sqlight.text(cid)],
560
560
+
expecting: count_decoder,
561
561
+
)
562
562
+
{
563
563
+
Ok([count]) if count > 0 -> Ok(True)
564
564
+
Ok(_) -> Ok(False)
565
565
+
Error(err) -> Error(err)
566
566
+
})
567
567
+
568
568
+
case cid_exists {
569
569
+
True -> Ok(Nil)
570
570
+
False -> {
571
571
+
let sql =
572
572
+
"
573
573
+
INSERT INTO record (uri, cid, did, collection, json)
574
574
+
VALUES (?, ?, ?, ?, ?)
575
575
+
ON CONFLICT(uri) DO UPDATE SET
576
576
+
cid = excluded.cid,
577
577
+
json = excluded.json,
578
578
+
indexed_at = datetime('now')
579
579
+
"
580
580
+
581
581
+
use _ <- result.try(sqlight.query(
582
582
+
sql,
583
583
+
on: conn,
584
584
+
with: [
585
585
+
sqlight.text(uri),
586
586
+
sqlight.text(cid),
587
587
+
sqlight.text(did),
588
588
+
sqlight.text(collection),
589
589
+
sqlight.text(json),
590
590
+
],
591
591
+
expecting: decode.string,
592
592
+
))
593
593
+
Ok(Nil)
594
594
+
}
595
595
+
}
596
596
+
}
597
597
+
}
490
598
}
491
599
492
600
/// Batch inserts or updates multiple records in the database
/// More efficient than individual inserts for large datasets
///
/// Records whose CID is already stored in the `record` table (under any URI)
/// are filtered out before writing. That covers both an unchanged record
/// (same URI, same CID) and duplicate content arriving under a different URI.
/// The remaining records are upserted by URI.
///
/// Both the CID lookup and the inserts are issued in chunks of `batch_size`
/// records so no statement exceeds SQLite's bound-parameter limit.
/// Returns the first `sqlight.Error` encountered, or `Ok(Nil)`.
pub fn batch_insert_records(
  conn: sqlight.Connection,
  records: List(Record),
) -> Result(Nil, sqlight.Error) {
  // Process records in smaller batches to avoid SQL parameter limits
  // SQLite has a default limit of 999 parameters
  // An insert uses 5 parameters per record (500 per batch); a CID lookup
  // uses 1 per record (100 per batch).
  let batch_size = 100

  let cid_decoder = {
    use cid <- decode.field(0, decode.string)
    decode.success(cid)
  }

  // Collect every incoming CID that is already present in the table.
  // A Dict(String, Bool) is used as a set for fast membership checks.
  // With no records there are no chunks, so no query runs.
  use existing_cid_set <- result.try(
    list.sized_chunk(records, batch_size)
    |> list.try_fold(dict.new(), fn(found, batch) {
      let placeholders =
        list.map(batch, fn(_) { "?" })
        |> string.join(", ")

      let check_cids_sql = "
        SELECT cid
        FROM record
        WHERE cid IN (" <> placeholders <> ")
      "

      use stored_cids <- result.try(sqlight.query(
        check_cids_sql,
        on: conn,
        with: list.map(batch, fn(record) { sqlight.text(record.cid) }),
        expecting: cid_decoder,
      ))

      Ok(
        list.fold(stored_cids, found, fn(acc, cid) {
          dict.insert(acc, cid, True)
        }),
      )
    }),
  )

  // Keep only records whose CID is not stored yet. A separate URI -> CID
  // lookup is unnecessary: if a URI is stored with this same CID, then the
  // CID is in the table and the record is dropped by this test as well.
  let filtered_records =
    list.filter(records, fn(record) {
      !dict.has_key(existing_cid_set, record.cid)
    })

  // With nothing left to write, there are no chunks and this returns Ok(Nil).
  list.sized_chunk(filtered_records, batch_size)
  |> list.try_each(fn(batch) {
    // Build the SQL with multiple value sets
    let value_placeholders =
      list.repeat("(?, ?, ?, ?, ?)", list.length(batch))
      |> string.join(", ")

    let sql = "
      INSERT INTO record (uri, cid, did, collection, json)
      VALUES " <> value_placeholders <> "
      ON CONFLICT(uri) DO UPDATE SET
        cid = excluded.cid,
        json = excluded.json,
        indexed_at = datetime('now')
    "

    // Flatten all record parameters into a single list, in column order
    let params =
      list.flat_map(batch, fn(record) {
        [
          sqlight.text(record.uri),
          sqlight.text(record.cid),
          sqlight.text(record.did),
          sqlight.text(record.collection),
          sqlight.text(record.json),
        ]
      })

    // The statement returns no rows, so the decoder is never applied.
    use _ <- result.try(sqlight.query(
      sql,
      on: conn,
      with: params,
      expecting: decode.string,
    ))
    Ok(Nil)
  })
}
540
716
541
717
/// Gets a record by URI