objective categorical abstract machine language personal data server

Improve MST/repo performance

futur.blue 791c68e2 57e06aab

verified
+600 -362
+14 -9
ipld/lib/dag_cbor.ml
··· 1 1 module String_map = Map.Make (String) 2 2 3 + (* sort map keys by length first, then lexicographically *) 4 + let dag_cbor_key_compare a b = 5 + let la = String.length a in 6 + let lb = String.length b in 7 + if la = lb then String.compare a b else compare la lb 8 + 3 9 let ordered_map_keys (m : 'a String_map.t) : string list = 4 10 let keys = String_map.bindings m |> List.map fst in 5 - List.sort 6 - (fun a b -> 7 - let la = String.length a in 8 - let lb = String.length b in 9 - if la = lb then String.compare a b else compare la lb ) 10 - keys 11 + List.sort dag_cbor_key_compare keys 12 + 13 + (* returns bindings sorted in dag-cbor canonical order *) 14 + let ordered_map_bindings (m : 'a String_map.t) : (string * 'a) list = 15 + String_map.bindings m |> List.sort (fun (a, _) (b, _) -> dag_cbor_key_compare a b) 11 16 12 17 let type_info_length len = 13 18 if len < 24 then 1 ··· 195 200 | `Map m -> 196 201 let len = String_map.cardinal m in 197 202 write_type_and_argument t 5 (Int64.of_int len) ; 198 - ordered_map_keys m 199 - |> List.iter (fun k -> 203 + ordered_map_bindings m 204 + |> List.iter (fun (k, v) -> 200 205 write_string t k ; 201 - write_value t (String_map.find k m) ) 206 + write_value t v ) 202 207 | `Link cid -> 203 208 write_cid t cid 204 209
+400 -155
mist/lib/mst.ml
··· 208 208 209 209 val proof_for_key : t -> Cid.t -> string -> Block_map.t Lwt.t 210 210 211 + val proof_for_keys : t -> Cid.t -> string list -> Block_map.t Lwt.t 212 + 211 213 val leaf_count : t -> int Lwt.t 212 214 213 215 val layer : t -> int Lwt.t ··· 246 248 247 249 let create blockstore root = {blockstore; root} 248 250 251 + let entries_are_sorted (entries : entry list) : bool = 252 + let rec aux prev_key = function 253 + | [] -> 254 + true 255 + | e :: tl -> 256 + if String.compare prev_key e.key <= 0 then aux e.key tl else false 257 + in 258 + match entries with [] -> true | e :: tl -> aux e.key tl 259 + 260 + (* we try to batch reads from the blockstore when possible 261 + 200 seems like a sane upper limit for that *) 262 + let batch_size = 200 263 + 264 + let take_n n lst = 265 + if n <= 0 then ([], lst) 266 + else 267 + let rec loop acc remaining xs = 268 + match (remaining, xs) with 269 + | 0, _ -> 270 + (List.rev acc, xs) 271 + | _, [] -> 272 + (List.rev acc, []) 273 + | _, x :: xs' -> 274 + loop (x :: acc) (remaining - 1) xs' 275 + in 276 + loop [] n lst 277 + 278 + let get_blocks_exn (t : t) (cids : Cid.t list) : Block_map.t Lwt.t = 279 + if List.is_empty cids then Lwt.return Block_map.empty 280 + else 281 + let%lwt bm = Store.get_blocks t.blockstore cids in 282 + match bm.missing with 283 + | [] -> 284 + Lwt.return bm.blocks 285 + | missing :: _ -> 286 + failwith ("missing mst node block: " ^ Cid.to_string missing) 287 + 249 288 (* retrieves a raw node by cid *) 250 289 let retrieve_node_raw t cid : node_raw option Lwt.t = 251 290 match%lwt Store.get_bytes t.blockstore cid with ··· 288 327 | None -> 289 328 lazy Lwt.return_none 290 329 in 330 + let last_key = ref "" in 291 331 let entries = 292 - List.fold_left 293 - (fun (entries : entry list) entry -> 332 + List.map 333 + (fun entry -> 294 334 let prefix = 295 - match entries with 296 - | [] -> 297 - "" 298 - | prev :: _ -> 299 - String.sub prev.key 0 entry.p 335 + if entry.p = 0 then "" 336 + else if !last_key = "" then "" 337 + else String.sub !last_key 0 entry.p 300 338 in 301 339 let path = String.concat "" [prefix; Bytes.to_string entry.k] in 302 340 Util.ensure_valid_key path ; 341 + last_key := path ; 303 342 let right = 304 343 match entry.t with 305 344 | Some r -> ··· 307 346 | None -> 308 347 lazy Lwt.return_none 309 348 in 310 - ({layer; key= path; value= entry.v; right} : entry) :: entries ) 311 - [] node_raw.e 349 + ({layer; key= path; value= entry.v; right} : entry) ) 350 + node_raw.e 312 351 in 313 352 Lwt.return {layer; left; entries} 314 353 ··· 349 388 (* returns a map of key -> cid *) 350 389 let build_map t : Cid.t String_map.t Lwt.t = 351 390 let map = ref String_map.empty in 352 - let%lwt () = 353 - traverse t (fun path cid -> map := String_map.add path cid !map) 391 + let rec loop queue visited = 392 + match queue with 393 + | [] -> 394 + Lwt.return !map 395 + | _ -> 396 + let batch, rest = take_n batch_size queue in 397 + let to_fetch = 398 + List.filter (fun (cid, _) -> not (Cid.Set.mem cid visited)) batch 399 + in 400 + let%lwt blocks = get_blocks_exn t (List.map fst to_fetch) in 401 + let visited', next_queue = 402 + List.fold_left 403 + (fun (visited, queue) (cid, prefix) -> 404 + if Cid.Set.mem cid visited then (visited, queue) 405 + else 406 + let bytes = Block_map.get cid blocks |> Option.get in 407 + let raw = decode_block_raw bytes in 408 + let last_key = ref prefix in 409 + let next_pairs = 410 + List.fold_left 411 + (fun acc (e : entry_raw) -> 412 + let key_prefix = 413 + if e.p = 0 then "" 414 + else if e.p <= String.length !last_key then 415 + String.sub !last_key 0 e.p 416 + else !last_key 417 + in 418 + let full_key = key_prefix ^ Bytes.to_string e.k in 419 + Util.ensure_valid_key full_key ; 420 + last_key := full_key ; 421 + map := String_map.add full_key e.v !map ; 422 + match e.t with 423 + | Some r -> 424 + (r, full_key) :: acc 425 + | None -> 426 + acc ) 427 + ( match raw.l with 428 + | Some l -> 429 + [(l, prefix)] 430 + | None -> 431 + [] ) 432 + raw.e 433 + in 434 + (Cid.Set.add cid visited, List.rev_append next_pairs queue) ) 435 + (visited, rest) batch 436 + in 437 + loop next_queue visited' 354 438 in 355 - Lwt.return !map 439 + loop [(t.root, "")] Cid.Set.empty 356 440 357 441 (* returns all non-leaf mst node blocks in order for a car stream 358 442 leaf cids can be obtained via collect_nodes_and_leaves or leaves_of_root *) ··· 404 488 for each node: node block, left subtree, then for each entry: record, right subtree *) 405 489 let to_ordered_stream t : ordered_item Lwt_seq.t = 406 490 (* queue items: `Node cid to visit, `Leaf cid to yield *) 407 - let rec step queue = 491 + let prefetch queue cache missing = 492 + let rec collect acc seen remaining = function 493 + | [] -> 494 + (List.rev acc, seen) 495 + | _ when remaining = 0 -> 496 + (List.rev acc, seen) 497 + | `Node cid :: rest -> 498 + if 499 + Cid.Set.mem cid missing 500 + || Block_map.has cid cache 501 + || Cid.Set.mem cid seen 502 + then collect acc seen remaining rest 503 + else 504 + collect (cid :: acc) (Cid.Set.add cid seen) (remaining - 1) rest 505 + | _ :: rest -> 506 + collect acc seen remaining rest 507 + in 508 + let cids, _seen = collect [] Cid.Set.empty batch_size queue in 509 + if List.is_empty cids then Lwt.return (cache, missing) 510 + else 511 + let%lwt bm = Store.get_blocks t.blockstore cids in 512 + let cache' = 513 + List.fold_left 514 + (fun acc (cid, bytes) -> Block_map.set cid bytes acc) 515 + cache (Block_map.entries bm.blocks) 516 + in 517 + let missing' = 518 + List.fold_left 519 + (fun acc cid -> Cid.Set.add cid acc) 520 + missing bm.missing 521 + in 522 + Lwt.return (cache', missing') 523 + in 524 + let rec step (queue, cache, missing) = 408 525 match queue with 409 526 | [] -> 410 527 Lwt.return_none 411 - | `Node cid :: rest -> ( 412 - let%lwt bytes_opt = Store.get_bytes t.blockstore cid in 413 - match bytes_opt with 414 - | None -> 415 - step rest 416 - | Some bytes -> 417 - let node = decode_block_raw bytes in 418 - (* queue items: left subtree, then for each entry: record then right subtree *) 419 - let left_queue = 420 - match node.l with Some l -> [`Node l] | None -> [] 421 - in 422 - let entries_queue = 423 - List.concat_map 424 - (fun (e : entry_raw) -> 425 - let right_queue = 426 - match e.t with Some r -> [`Node r] | None -> [] 427 - in 428 - `Leaf e.v :: right_queue ) 429 - node.e 430 - in 431 - let new_queue = left_queue @ entries_queue @ rest in 432 - Lwt.return_some ((Node (cid, bytes) : ordered_item), new_queue) ) 433 528 | `Leaf cid :: rest -> 434 - Lwt.return_some ((Leaf cid : ordered_item), rest) 529 + Lwt.return_some ((Leaf cid : ordered_item), (rest, cache, missing)) 530 + | `Node cid :: rest -> 531 + if Cid.Set.mem cid missing then step (rest, cache, missing) 532 + else 533 + ( match Block_map.get cid cache with 534 + | None -> 535 + let%lwt cache', missing' = prefetch queue cache missing in 536 + if cache' == cache && Cid.Set.mem cid missing' then 537 + step (rest, cache', missing') 538 + else step (queue, cache', missing') 539 + | Some bytes -> 540 + let node = decode_block_raw bytes in 541 + (* queue items: left subtree, then for each entry: record then right subtree *) 542 + let left_queue = 543 + match node.l with Some l -> [`Node l] | None -> [] 544 + in 545 + let entries_queue = 546 + List.concat_map 547 + (fun (e : entry_raw) -> 548 + let right_queue = 549 + match e.t with Some r -> [`Node r] | None -> [] 550 + in 551 + `Leaf e.v :: right_queue ) 552 + node.e 553 + in 554 + let new_queue = left_queue @ entries_queue @ rest in 555 + let cache' = Block_map.remove cid cache in 556 + Lwt.return_some 557 + ((Node (cid, bytes) : ordered_item), (new_queue, cache', missing)) 558 + ) 435 559 in 436 - Lwt_seq.unfold_lwt step [`Node t.root] 560 + Lwt_seq.unfold_lwt step ([`Node t.root], Block_map.empty, Cid.Set.empty) 437 561 438 562 (* produces a cid and cbor-encoded bytes for a given tree *) 439 563 let serialize t node : (Cid.t * bytes, exn) Lwt_result.t = 440 - let sorted_entries = 441 - List.sort (fun (a : entry) b -> String.compare a.key b.key) node.entries 442 - in 443 564 let rec aux node : (Cid.t * bytes) Lwt.t = 565 + let entries = 566 + if entries_are_sorted node.entries then node.entries 567 + else 568 + List.sort (fun (a : entry) b -> String.compare a.key b.key) 569 + node.entries 570 + in 444 571 let%lwt left = 445 572 node.left 446 573 >>? function ··· 473 600 ; p= prefix_len 474 601 ; v= entry.value 475 602 ; t= right } ) 476 - node.entries 603 + entries 477 604 in 478 605 let encoded = 479 606 Dag_cbor.encode (encode_node_raw {l= left; e= mst_entries}) ··· 485 612 | Error e -> 486 613 raise e 487 614 in 488 - try%lwt Lwt.map Result.ok (aux {node with entries= sorted_entries}) 615 + try%lwt Lwt.map Result.ok (aux node) 489 616 with e -> Lwt.return_error e 490 617 491 618 (* raw-node helpers for covering proofs: operate on stored bytes, not re-serialization *) ··· 545 672 let seq = interleave_raw raw keys in 546 673 let index = find_gte_leaf_index key seq in 547 674 let%lwt blocks = 548 - match Util.at_index index seq with 675 + match List.nth_opt seq index with 549 676 | Some (Leaf (k, _, _)) when k = key -> 550 677 Lwt.return Block_map.empty 551 678 | Some (Leaf (_k, v_right, _)) -> ( 552 679 let prev = 553 - if index - 1 >= 0 then Util.at_index (index - 1) seq else None 680 + if index - 1 >= 0 then List.nth_opt seq (index - 1) else None 554 681 in 555 682 match prev with 556 683 | Some (Tree c) -> ··· 587 714 proof_for_key t c key 588 715 | None -> ( 589 716 let prev = 590 - if index - 1 >= 0 then Util.at_index (index - 1) seq else None 717 + if index - 1 >= 0 then List.nth_opt seq (index - 1) else None 591 718 in 592 719 match prev with 593 720 | Some (Tree c) -> ··· 602 729 None 603 730 in 604 731 let right_leaf = 605 - match Util.at_index index seq with 732 + match List.nth_opt seq index with 606 733 | Some (Leaf (_, v_right, _)) -> 607 734 Some v_right 608 735 | _ -> ··· 634 761 in 635 762 Lwt.return (Block_map.set cid bytes blocks) 636 763 764 + let proof_for_keys t cid keys : Block_map.t Lwt.t = 765 + if List.is_empty keys then Lwt.return Block_map.empty 766 + else 767 + let keys = List.sort_uniq String.compare keys in 768 + let cache = ref Block_map.empty in 769 + let missing = ref Cid.Set.empty in 770 + let acc = ref Block_map.empty in 771 + let add_block cid bytes = 772 + if not (Block_map.has cid !acc) then 773 + acc := Block_map.set cid bytes !acc 774 + in 775 + let get_bytes_cached cid = 776 + match Block_map.get cid !cache with 777 + | Some bytes -> 778 + Lwt.return_some bytes 779 + | None -> 780 + if Cid.Set.mem cid !missing then Lwt.return_none 781 + else 782 + let%lwt bytes_opt = Store.get_bytes t.blockstore cid in 783 + ( match bytes_opt with 784 + | Some bytes -> 785 + cache := Block_map.set cid bytes !cache 786 + | None -> 787 + missing := Cid.Set.add cid !missing ) ; 788 + Lwt.return bytes_opt 789 + in 790 + let add_leaf cid_opt = 791 + match cid_opt with 792 + | None -> 793 + Lwt.return_unit 794 + | Some leaf_cid -> ( 795 + match%lwt get_bytes_cached leaf_cid with 796 + | Some bytes -> 797 + add_block leaf_cid bytes ; 798 + Lwt.return_unit 799 + | None -> 800 + Lwt.return_unit ) 801 + in 802 + let rec proof_for_key_cached cid key = 803 + match%lwt get_bytes_cached cid with 804 + | None -> 805 + Lwt.return_unit 806 + | Some bytes -> 807 + add_block cid bytes ; 808 + let raw = decode_block_raw bytes in 809 + let keys = node_entry_keys raw in 810 + let seq = interleave_raw raw keys in 811 + let index = find_gte_leaf_index key seq in 812 + ( match List.nth_opt seq index with 813 + | Some (Leaf (k, _, _)) when k = key -> 814 + Lwt.return_unit 815 + | Some (Leaf (_k, v_right, _)) -> ( 816 + let prev = 817 + if index - 1 >= 0 then List.nth_opt seq (index - 1) else None 818 + in 819 + match prev with 820 + | Some (Tree c) -> 821 + proof_for_key_cached c key 822 + | _ -> 823 + let left_leaf = 824 + match prev with 825 + | Some (Leaf (_, v_left, _)) -> 826 + Some v_left 827 + | _ -> 828 + None 829 + in 830 + let%lwt () = add_leaf left_leaf in 831 + add_leaf (Some v_right) ) 832 + | Some (Tree c) -> 833 + proof_for_key_cached c key 834 + | None -> ( 835 + let prev = 836 + if index - 1 >= 0 then List.nth_opt seq (index - 1) else None 837 + in 838 + match prev with 839 + | Some (Tree c) -> 840 + proof_for_key_cached c key 841 + | _ -> 842 + let left_leaf = 843 + match prev with 844 + | Some (Leaf (_, v_left, _)) -> 845 + Some v_left 846 + | _ -> 847 + None 848 + in 849 + let right_leaf = 850 + match List.nth_opt seq index with 851 + | Some (Leaf (_, v_right, _)) -> 852 + Some v_right 853 + | _ -> 854 + None 855 + in 856 + let%lwt () = add_leaf left_leaf in 857 + add_leaf right_leaf ) ) 858 + in 859 + let%lwt () = Lwt_list.iter_s (proof_for_key_cached cid) keys in 860 + Lwt.return !acc 861 + 637 862 (* collects all node blocks (cid, bytes) and all leaf cids reachable from root 638 863 only traverses nodes; doesn't fetch leaf blocks 639 864 returns (nodes, visited, leaves) *) 640 865 let collect_nodes_and_leaves t : 641 866 ((Cid.t * bytes) list * Cid.Set.t * Cid.Set.t) Lwt.t = 642 - let rec bfs (queue : Cid.t list) (visited : Cid.Set.t) 643 - (nodes : (Cid.t * bytes) list) (leaves : Cid.Set.t) = 867 + let rec loop queue visited nodes leaves = 644 868 match queue with 645 869 | [] -> 646 870 Lwt.return (nodes, visited, leaves) 647 - | cid :: rest -> ( 648 - if Cid.Set.mem cid visited then bfs rest visited nodes leaves 649 - else 650 - let%lwt bytes_opt = Store.get_bytes t.blockstore cid in 651 - match bytes_opt with 652 - | None -> 653 - failwith ("missing mst node block: " ^ Cid.to_string cid) 654 - | Some bytes -> 655 - let raw = decode_block_raw bytes in 656 - (* queue subtrees *) 657 - let next_cids = 658 - let acc = match raw.l with Some l -> [l] | None -> [] in 659 - List.fold_left 660 - (fun acc e -> 661 - match e.t with Some r -> r :: acc | None -> acc ) 662 - acc raw.e 663 - in 664 - (* accumulate leaf cids *) 665 - let leaves' = 666 - List.fold_left (fun s e -> Cid.Set.add e.v s) leaves raw.e 667 - in 668 - let visited' = Cid.Set.add cid visited in 669 - bfs 670 - (List.rev_append next_cids rest) 671 - visited' ((cid, bytes) :: nodes) leaves' ) 871 + | _ -> 872 + let batch, rest = take_n batch_size queue in 873 + let to_fetch = 874 + List.filter (fun cid -> not (Cid.Set.mem cid visited)) batch 875 + in 876 + let%lwt blocks = get_blocks_exn t to_fetch in 877 + let visited', nodes', leaves', next_queue = 878 + List.fold_left 879 + (fun (visited, nodes, leaves, queue) cid -> 880 + if Cid.Set.mem cid visited then (visited, nodes, leaves, queue) 881 + else 882 + let bytes = Block_map.get cid blocks |> Option.get in 883 + let raw = decode_block_raw bytes in 884 + let next_cids = 885 + let acc = match raw.l with Some l -> [l] | None -> [] in 886 + List.fold_left 887 + (fun acc e -> 888 + match e.t with Some r -> r :: acc | None -> acc ) 889 + acc raw.e 890 + in 891 + let leaves' = 892 + List.fold_left (fun s e -> Cid.Set.add e.v s) leaves raw.e 893 + in 894 + let visited' = Cid.Set.add cid visited in 895 + ( visited' 896 + , (cid, bytes) :: nodes 897 + , leaves' 898 + , List.rev_append next_cids queue ) ) 899 + (visited, nodes, leaves, rest) batch 900 + in 901 + loop next_queue visited' nodes' leaves' 672 902 in 673 - bfs [t.root] Cid.Set.empty [] Cid.Set.empty 903 + loop [t.root] Cid.Set.empty [] Cid.Set.empty 674 904 675 905 (* list of all leaves belonging to a node and its children, ordered by key *) 676 906 let rec leaves_of_node n : (string * Cid.t) list Lwt.t = 677 907 let%lwt left_leaves = 678 908 n.left >>? function Some l -> leaves_of_node l | None -> Lwt.return [] 679 909 in 680 - let sorted_entries = 681 - List.sort 682 - (fun (a : entry) (b : entry) -> String.compare a.key b.key) 683 - n.entries 910 + let entries = 911 + if entries_are_sorted n.entries then n.entries 912 + else 913 + List.sort 914 + (fun (a : entry) (b : entry) -> String.compare a.key b.key) 915 + n.entries 684 916 in 685 - let%lwt leaves = 686 - Lwt_list.fold_left_s 687 - (fun acc e -> 917 + let%lwt entry_sublists = 918 + Lwt_list.map_s 919 + (fun e -> 688 920 let%lwt right_leaves = 689 921 e.right 690 922 >>? function Some r -> leaves_of_node r | None -> Lwt.return [] 691 923 in 692 - Lwt.return (acc @ [(e.key, e.value)] @ right_leaves) ) 693 - left_leaves sorted_entries 924 + Lwt.return ((e.key, e.value) :: right_leaves) ) 925 + entries 694 926 in 695 - Lwt.return leaves 927 + Lwt.return (left_leaves @ List.concat entry_sublists) 696 928 697 929 (* list of all leaves in the mst *) 698 930 let leaves_of_root t : (string * Cid.t) list Lwt.t = ··· 704 936 705 937 (* returns a count of all leaves in the mst *) 706 938 let leaf_count t : int Lwt.t = 707 - match%lwt retrieve_node t t.root with 708 - | None -> 709 - failwith "root cid not found in repo store" 710 - | Some root -> 711 - let rec count (n : node) : int Lwt.t = 712 - let%lwt left_count = 713 - n.left >>? function Some l -> count l | None -> Lwt.return 0 939 + let rec loop queue visited acc = 940 + match queue with 941 + | [] -> 942 + Lwt.return acc 943 + | _ -> 944 + let batch, rest = take_n batch_size queue in 945 + let to_fetch = 946 + List.filter (fun cid -> not (Cid.Set.mem cid visited)) batch 714 947 in 715 - let%lwt right_counts = 716 - Lwt_list.map_s 717 - (fun (e : entry) -> 718 - e.right >>? function Some r -> count r | None -> Lwt.return 0 ) 719 - n.entries 948 + let%lwt blocks = get_blocks_exn t to_fetch in 949 + let visited', acc', next_queue = 950 + List.fold_left 951 + (fun (visited, acc, queue) cid -> 952 + if Cid.Set.mem cid visited then (visited, acc, queue) 953 + else 954 + let bytes = Block_map.get cid blocks |> Option.get in 955 + let raw = decode_block_raw bytes in 956 + let next_cids = 957 + let acc = match raw.l with Some l -> [l] | None -> [] in 958 + List.fold_left 959 + (fun acc e -> 960 + match e.t with Some r -> r :: acc | None -> acc ) 961 + acc raw.e 962 + in 963 + let visited' = Cid.Set.add cid visited in 964 + ( visited' 965 + , acc + List.length raw.e 966 + , List.rev_append next_cids queue ) ) 967 + (visited, acc, rest) batch 720 968 in 721 - let sum_right = List.fold_left ( + ) 0 right_counts in 722 - Lwt.return (left_count + List.length n.entries + sum_right) 723 - in 724 - count root 969 + loop next_queue visited' acc' 970 + in 971 + loop [t.root] Cid.Set.empty 0 725 972 726 973 (* returns height of mst root *) 727 974 let layer t : int Lwt.t = ··· 733 980 734 981 (* returns all nodes sorted by cid *) 735 982 let all_nodes t : (Cid.t * bytes) list Lwt.t = 736 - let rec bfs (queue : Cid.t list) (visited : Cid.Set.t) 737 - (nodes : (Cid.t * bytes) list) = 983 + let rec loop queue visited nodes = 738 984 match queue with 739 985 | [] -> 740 986 Lwt.return nodes 741 - | cid :: rest -> ( 742 - if Cid.Set.mem cid visited then bfs rest visited nodes 743 - else 744 - match%lwt Store.get_bytes t.blockstore cid with 745 - | None -> 746 - failwith ("missing mst node block: " ^ Cid.to_string cid) 747 - | Some bytes -> 748 - let raw = decode_block_raw bytes in 749 - let next_cids = 750 - let acc = match raw.l with Some l -> [l] | None -> [] in 751 - List.fold_left 752 - (fun acc e -> 753 - match e.t with Some r -> r :: acc | None -> acc ) 754 - acc raw.e 755 - in 756 - let visited' = Cid.Set.add cid visited in 757 - bfs 758 - (List.rev_append next_cids rest) 759 - visited' ((cid, bytes) :: nodes) ) 987 + | _ -> 988 + let batch, rest = take_n batch_size queue in 989 + let to_fetch = 990 + List.filter (fun cid -> not (Cid.Set.mem cid visited)) batch 991 + in 992 + let%lwt blocks = get_blocks_exn t to_fetch in 993 + let visited', nodes', next_queue = 994 + List.fold_left 995 + (fun (visited, nodes, queue) cid -> 996 + if Cid.Set.mem cid visited then (visited, nodes, queue) 997 + else 998 + let bytes = Block_map.get cid blocks |> Option.get in 999 + let raw = decode_block_raw bytes in 1000 + let next_cids = 1001 + let acc = match raw.l with Some l -> [l] | None -> [] in 1002 + List.fold_left 1003 + (fun acc e -> 1004 + match e.t with Some r -> r :: acc | None -> acc ) 1005 + acc raw.e 1006 + in 1007 + let visited' = Cid.Set.add cid visited in 1008 + ( visited' 1009 + , (cid, bytes) :: nodes 1010 + , List.rev_append next_cids queue ) ) 1011 + (visited, nodes, rest) batch 1012 + in 1013 + loop next_queue visited' nodes' 760 1014 in 761 - let%lwt nodes = bfs [t.root] Cid.Set.empty [] in 1015 + let%lwt nodes = loop [t.root] Cid.Set.empty [] in 762 1016 let sorted = 763 1017 List.sort 764 1018 (fun (a, _) (b, _) -> String.compare (Cid.to_string a) (Cid.to_string b)) ··· 803 1057 let root_layer = 804 1058 List.fold_left (fun acc (_, _, lz) -> max acc lz) 0 with_layers 805 1059 in 806 - let on_layer = 807 - List.filter (fun (_, _, lz) -> lz = root_layer) with_layers 808 - |> List.map (fun (k, v, _) -> (k, v)) 809 - in 810 - (* left group is keys below first on-layer key *) 811 - let left_group = 812 - match on_layer with 813 - | (k0, _) :: _ -> 814 - List.filter 815 - (fun (k, _, lz) -> lz < root_layer && k < k0) 816 - with_layers 817 - |> List.map (fun (k, v, _) -> (k, v)) 818 - | [] -> 819 - [] 1060 + let left_group, on_layer, right_groups = 1061 + let left_group = ref [] in 1062 + let current_group = ref [] in 1063 + let on_layer_rev = ref [] in 1064 + let groups_rev = ref [] in 1065 + let seen_on = ref false in 1066 + List.iter 1067 + (fun (k, v, lz) -> 1068 + if lz = root_layer then ( 1069 + if not !seen_on then left_group := List.rev !current_group 1070 + else groups_rev := List.rev !current_group :: !groups_rev ; 1071 + current_group := [] ; 1072 + on_layer_rev := (k, v) :: !on_layer_rev ; 1073 + seen_on := true ) 1074 + else current_group := (k, v) :: !current_group ) 1075 + with_layers ; 1076 + let on_layer = List.rev !on_layer_rev in 1077 + let right_groups = 1078 + if not !seen_on then [] 1079 + else List.rev (List.rev !current_group :: !groups_rev) 1080 + in 1081 + (!left_group, on_layer, right_groups) 820 1082 in 821 1083 let%lwt l_cid = 822 1084 match left_group with ··· 838 1100 let%lwt c = wrap cid child_layer in 839 1101 Lwt.return_some c 840 1102 in 841 - (* compute right groups aligned to on-layer entries *) 842 - let rec right_groups acc rest = 843 - match rest with 844 - | [] -> 845 - List.rev acc 846 - | (k, _) :: tl -> 847 - let upper = 848 - match tl with (k2, _) :: _ -> Some k2 | [] -> None 849 - in 850 - let grp = 851 - List.filter 852 - (fun (k', _, lz) -> 853 - lz < root_layer && k' > k 854 - && match upper with Some ku -> k' < ku | None -> true ) 855 - with_layers 856 - |> List.map (fun (k', v', _) -> (k', v')) 857 - in 858 - right_groups ((k, grp) :: acc) tl 1103 + let rights = 1104 + List.map2 (fun (k, _) grp -> (k, grp)) on_layer right_groups 859 1105 in 860 - let rights = right_groups [] on_layer in 861 1106 let%lwt t_links = 862 1107 Lwt_list.map_s 863 1108 (fun (_k, grp) ->
-11
mist/lib/util.ml
··· 48 48 49 49 let rec last (lst : 'a list) : 'a option = 50 50 match lst with [] -> None | [x] -> Some x | _ :: xs -> last xs 51 - 52 - let at_index i (lst : 'a list) : 'a option = 53 - let rec aux j = function 54 - | [] -> 55 - None 56 - | [x] -> 57 - Some x 58 - | x :: xs -> 59 - if j = 0 then Some x else aux (j - 1) xs 60 - in 61 - aux i lst
+1 -1
pegasus/bench/bench_repository.ml
··· 406 406 let bench_db_io_patterns () = 407 407 print_header "database i/o" ; 408 408 let%lwt db, path = setup_test_db () in 409 - let size = 100000 in 409 + let size = 20000 in 410 410 let blocks = generate_blocks size in 411 411 let%lwt () = 412 412 Util.use_pool db.db (fun conn -> User_store.Bulk.put_blocks blocks conn) >|= fun _ -> ()
+29 -37
pegasus/lib/repository.ml
··· 221 221 ref (Cached_mst.create cached_store prev_commit.data) 222 222 in 223 223 (* ops to emit, built in loop because prev_data (previous cid) is otherwise inaccessible *) 224 - let commit_ops : commit_evt_op list ref = ref [] in 224 + let commit_ops_rev : commit_evt_op list ref = ref [] in 225 225 let added_leaves = ref Block_map.empty in 226 226 let%lwt results = 227 227 Lwt_list.map_s ··· 250 250 User_store.put_record t.db (`LexMap record_with_type) path 251 251 in 252 252 added_leaves := Block_map.set cid block !added_leaves ; 253 - commit_ops := 254 - !commit_ops @ [{action= `Create; path; cid= Some cid; prev= None}] ; 253 + commit_ops_rev := 254 + {action= `Create; path; cid= Some cid; prev= None} 255 + :: !commit_ops_rev ; 255 256 let%lwt new_mst = Cached_mst.add !mst path cid in 256 257 mst := new_mst ; 257 258 let refs = ··· 272 273 | Update {collection; rkey; value; swap_record; _} -> 273 274 let path = Format.sprintf "%s/%s" collection rkey in 274 275 let uri = Format.sprintf "at://%s/%s" t.did path in 275 - let%lwt old_cid = User_store.get_record_cid t.db path in 276 + let%lwt existing_record = User_store.get_record t.db path in 277 + let old_cid = Option.map (fun (r : record) -> r.cid) existing_record in 276 278 ( if 277 279 (swap_record <> None && swap_record <> old_cid) 278 280 || (swap_record = None && old_cid = None) ··· 288 290 (Format.sprintf "attempted to update record %s with cid %s" 289 291 path cid_str ) ) ; 290 292 let%lwt () = 291 - match old_cid with 292 - | Some _ -> ( 293 - match%lwt User_store.get_record t.db path with 294 - | Some record -> 295 - let refs = 296 - Util.find_blob_refs record.value 297 - |> List.map (fun (r : Mist.Blob_ref.t) -> r.ref) 293 + match existing_record with 294 + | Some record -> 295 + let refs = 296 + Util.find_blob_refs record.value 297 + |> List.map (fun (r : Mist.Blob_ref.t) -> r.ref) 298 + in 299 + if not (List.is_empty refs) then 300 + let%lwt _ = 301 + User_store.delete_orphaned_blobs_by_record_path t.db path 298 302 in 299 - if not (List.is_empty refs) then 300 - let%lwt _ = 301 - User_store.delete_orphaned_blobs_by_record_path t.db 302 - path 303 - in 304 - Lwt.return_unit 305 - else Lwt.return_unit 306 - | None -> 307 - Lwt.return_unit ) 303 + Lwt.return_unit 304 + else Lwt.return_unit 308 305 | None -> 309 306 Lwt.return_unit 310 307 in ··· 316 313 User_store.put_record t.db (`LexMap record_with_type) path 317 314 in 318 315 added_leaves := Block_map.set new_cid new_block !added_leaves ; 319 - commit_ops := 320 - !commit_ops 321 - @ [{action= `Update; path; cid= Some new_cid; prev= old_cid}] ; 316 + commit_ops_rev := 317 + {action= `Update; path; cid= Some new_cid; prev= old_cid} 318 + :: !commit_ops_rev ; 322 319 let%lwt new_mst = Cached_mst.add !mst path new_cid in 323 320 mst := new_mst ; 324 321 let refs = ··· 339 336 ; cid= new_cid } ) 340 337 | Delete {collection; rkey; swap_record; _} -> 341 338 let path = Format.sprintf "%s/%s" collection rkey in 342 - let%lwt cid = User_store.get_record_cid t.db path in 339 + let%lwt existing_record = User_store.get_record t.db path in 340 + let cid = Option.map (fun (r : record) -> r.cid) existing_record in 343 341 ( if cid = None || (swap_record <> None && swap_record <> cid) then 344 342 let cid_str = 345 343 match cid with ··· 352 350 (Format.sprintf "attempted to delete record %s with cid %s" 353 351 path cid_str ) ) ; 354 352 let%lwt () = 355 - match%lwt User_store.get_record t.db path with 353 + match existing_record with 356 354 | Some record -> 357 355 let refs = 358 356 Util.find_blob_refs record.value ··· 368 366 Lwt.return_unit 369 367 in 370 368 let%lwt () = User_store.delete_record t.db path in 371 - commit_ops := 372 - !commit_ops @ [{action= `Delete; path; cid= None; prev= cid}] ; 369 + commit_ops_rev := 370 + {action= `Delete; path; cid= None; prev= cid} :: !commit_ops_rev ; 373 371 let%lwt new_mst = Cached_mst.delete !mst path in 374 372 mst := new_mst ; 375 373 Lwt.return 376 374 (Delete {type'= "com.atproto.repo.applyWrites#deleteResult"}) ) 377 375 writes 378 376 in 377 + let commit_ops = List.rev !commit_ops_rev in 379 378 let new_mst = !mst in 380 379 (* flush all writes, ensuring all blocks are written or none are *) 381 380 let%lwt () = ··· 390 389 let commit_block = 391 390 new_commit_signed |> signed_commit_to_yojson |> Dag_cbor.encode_yojson 392 391 in 393 - let%lwt proof_blocks = 394 - Lwt_list.fold_left_s 395 - (fun acc ({path; _} : commit_evt_op) -> 396 - let%lwt key_proof = 397 - Cached_mst.proof_for_key new_mst new_mst.root path 398 - in 399 - Lwt.return (Block_map.merge acc key_proof) ) 400 - Block_map.empty !commit_ops 401 - in 392 + let proof_keys = List.map (fun ({path; _} : commit_evt_op) -> path) commit_ops in 393 + let%lwt proof_blocks = Cached_mst.proof_for_keys new_mst new_mst.root proof_keys in 402 394 let proof_blocks = Block_map.merge proof_blocks !added_leaves in 403 395 let block_stream = 404 396 proof_blocks |> Block_map.entries |> Lwt_seq.of_list ··· 410 402 let%lwt ds = Data_store.connect () in 411 403 let%lwt _ = 412 404 Sequencer.sequence_commit ds ~did:t.did ~commit:new_commit_cid 413 - ~rev:new_commit_signed.rev ~blocks ~ops:!commit_ops ~since:prev_commit.rev 405 + ~rev:new_commit_signed.rev ~blocks ~ops:commit_ops ~since:prev_commit.rev 414 406 ~prev_data:prev_commit.data () 415 407 in 416 408 Lwt.return {commit= new_commit; results}
+156 -149
pegasus/lib/user_store.ml
··· 314 314 ~path ~cids 315 315 end 316 316 317 + module Bulk = struct 318 + open struct 319 + let escape_sql_string s = Str.global_replace (Str.regexp "'") "''" s 320 + 321 + let bytes_to_hex data = 322 + let buf = Buffer.create (Bytes.length data * 2) in 323 + Bytes.iter 324 + (fun c -> Buffer.add_string buf (Printf.sprintf "%02x" (Char.code c))) 325 + data ; 326 + Buffer.contents buf 327 + 328 + let chunk_list n lst = 329 + if n <= 0 then invalid_arg "negative n passed to chunk_list" ; 330 + let rec take_n acc remaining xs = 331 + match (remaining, xs) with 332 + | _, [] -> 333 + (List.rev acc, []) 334 + | 0, rest -> 335 + (List.rev acc, rest) 336 + | _, x :: xs' -> 337 + take_n (x :: acc) (remaining - 1) xs' 338 + in 339 + let rec go xs = 340 + match xs with 341 + | [] -> 342 + [] 343 + | _ -> 344 + let chunk, rest = take_n [] n xs in 345 + chunk :: go rest 346 + in 347 + go lst 348 + end 349 + 350 + let put_blocks (blocks : (Cid.t * bytes) list) conn = 351 + if List.is_empty blocks then Lwt.return_ok () 352 + else 353 + let module C = (val conn : Caqti_lwt.CONNECTION) in 354 + let chunks = chunk_list 200 blocks in 355 + let rec process_chunks = function 356 + | [] -> 357 + Lwt.return_ok () 358 + | chunk :: rest -> ( 359 + let values = 360 + List.map 361 + (fun (cid, data) -> 362 + let cid_str = escape_sql_string (Cid.to_string cid) in 363 + let hex_data = bytes_to_hex data in 364 + Printf.sprintf "('%s', CAST(X'%s' AS TEXT))" cid_str hex_data ) 365 + chunk 366 + |> String.concat ", " 367 + in 368 + let sql = 369 + Printf.sprintf 370 + "INSERT INTO mst (cid, data) VALUES %s ON CONFLICT DO NOTHING" 371 + values 372 + in 373 + let query = 374 + Caqti_request.Infix.( ->. ) Caqti_type.unit Caqti_type.unit sql 375 + in 376 + let%lwt result = C.exec query () in 377 + match result with 378 + | Ok () -> 379 + process_chunks rest 380 + | Error e -> 381 + Lwt.return_error e ) 382 + in 383 + process_chunks chunks 384 + 385 + let put_records (records : (string * Cid.t * bytes * string) list) conn = 386 + if List.is_empty records then Lwt.return_ok () 387 + else 388 + let module C = (val conn : Caqti_lwt.CONNECTION) in 389 + let chunks = chunk_list 100 records in 390 + let rec process_chunks = function 391 + | [] -> 392 + Lwt.return_ok () 393 + | chunk :: rest -> ( 394 + let values = 395 + List.map 396 + (fun (path, cid, data, since) -> 397 + let hex_data = bytes_to_hex data in 398 + Printf.sprintf "('%s', '%s', CAST(X'%s' AS TEXT), '%s')" 399 + (escape_sql_string path) 400 + (escape_sql_string (Cid.to_string cid)) 401 + hex_data (escape_sql_string since) ) 402 + chunk 403 + |> String.concat ", " 404 + in 405 + let sql = 406 + Printf.sprintf 407 + "INSERT INTO records (path, cid, data, since) VALUES %s ON \ 408 + CONFLICT (path) DO UPDATE SET cid = excluded.cid, data = \ 409 + excluded.data, since = excluded.since" 410 + values 411 + in 412 + let query = 413 + Caqti_request.Infix.( ->. ) Caqti_type.unit Caqti_type.unit sql 414 + in 415 + let%lwt result = C.exec query () in 416 + match result with 417 + | Ok () -> 418 + process_chunks rest 419 + | Error e -> 420 + Lwt.return_error e ) 421 + in 422 + process_chunks chunks 423 + 424 + let put_blob_refs (refs : (string * Cid.t) list) conn = 425 + if List.is_empty refs then Lwt.return_ok () 426 + else 427 + let module C = (val conn : Caqti_lwt.CONNECTION) in 428 + let chunks = chunk_list 200 refs in 429 + let rec process_chunks = function 430 + | [] -> 431 + Lwt.return_ok () 432 + | chunk :: rest -> ( 433 + let values = 434 + List.map 435 + (fun (path, cid) -> 436 + Printf.sprintf "('%s', '%s')" (escape_sql_string path) 437 + (escape_sql_string (Cid.to_string cid)) ) 438 + chunk 439 + |> String.concat ", " 440 + in 441 + let sql = 442 + Printf.sprintf 443 + "INSERT INTO blobs_records (record_path, blob_cid) VALUES %s \ 444 + ON CONFLICT DO NOTHING" 445 + values 446 + in 447 + let query = 448 + Caqti_request.Infix.( ->. ) Caqti_type.unit Caqti_type.unit sql 449 + in 450 + let%lwt result = C.exec query () in 451 + match result with 452 + | Ok () -> 453 + process_chunks rest 454 + | Error e -> 455 + Lwt.return_error e ) 456 + in 457 + process_chunks chunks 458 + end 459 + 317 460 type t = {did: string; db: Util.caqti_pool} 318 461 319 462 let pool_cache : (string, t) Hashtbl.t = Hashtbl.create 64 ··· 351 494 Lwt.return ({blocks= Block_map.empty; missing= []} : Block_map.with_missing) 352 495 else 353 496 let%lwt blocks = Util.use_pool t.db @@ Queries.get_blocks cids in 497 + let found_map = 498 + List.fold_left 499 + (fun acc ({cid; data} : block) -> Block_map.set cid data acc) 500 + Block_map.empty blocks 501 + in 354 502 Lwt.return 355 503 (List.fold_left 356 504 (fun (acc : Block_map.with_missing) cid -> 357 - match List.find_opt (fun (b : block) -> b.cid = cid) blocks with 358 - | Some {data; _} -> 505 + match Block_map.get cid found_map with 506 + | Some data -> 359 507 {acc with blocks= Block_map.set cid data acc.blocks} 360 508 | None -> 361 509 {acc with missing= cid :: acc.missing} ) ··· 376 524 Lwt.return false 377 525 378 526 let put_many t bm : (int, exn) Lwt_result.t = 379 - Util.multi_query t.db 380 - (List.map 381 - (fun (cid, block) -> Queries.put_block cid block) 382 - (Block_map.entries bm) ) 527 + let entries = Block_map.entries bm in 528 + if List.is_empty entries then Lwt.return_ok 0 529 + else 530 + Lwt_result.catch (fun () -> 531 + let%lwt () = Util.use_pool t.db (fun conn -> Bulk.put_blocks entries conn) in 532 + Lwt.return (List.length entries) ) 383 533 384 534 let delete_block t cid : (bool, exn) Lwt_result.t = 385 535 Lwt_result.catch ··· 569 719 let storage_str = Blob_store.storage_to_string storage in 570 720 Util.use_pool t.db 571 721 @@ Queries.list_blobs_by_storage ~storage:storage_str ~limit ~cursor 572 - 573 - module Bulk = struct 574 - open struct 575 - let escape_sql_string s = Str.global_replace (Str.regexp "'") "''" s 576 - 577 - let bytes_to_hex data = 578 - let buf = Buffer.create (Bytes.length data * 2) in 579 - Bytes.iter 580 - (fun c -> Buffer.add_string buf (Printf.sprintf "%02x" (Char.code c))) 581 - data ; 582 - Buffer.contents buf 583 - 584 - let chunk_list n lst = 585 - if n <= 0 then invalid_arg "negative n passed to chunk_list" ; 586 - let rec take_n acc remaining xs = 587 - match (remaining, xs) with 588 - | _, [] -> 589 - (List.rev acc, []) 590 - | 0, rest -> 591 - (List.rev acc, rest) 592 - | _, x :: xs' -> 593 - take_n (x :: acc) (remaining - 1) xs' 594 - in 595 - let rec go xs = 596 - match xs with 597 - | [] -> 598 - [] 599 - | _ -> 600 - let chunk, rest = take_n [] n xs in 601 - chunk :: go rest 602 - in 603 - go lst 604 - end 605 - 606 - let put_blocks (blocks : (Cid.t * bytes) list) conn = 607 - if List.is_empty blocks then Lwt.return_ok () 608 - else 609 - let module C = (val conn : Caqti_lwt.CONNECTION) in 610 - let chunks = chunk_list 200 blocks in 611 - let rec process_chunks = function 612 - | [] -> 613 - Lwt.return_ok () 614 - | chunk :: rest -> ( 615 - let values = 616 - List.map 617 - (fun (cid, data) -> 618 - let cid_str = escape_sql_string (Cid.to_string cid) in 619 - let hex_data = bytes_to_hex data in 620 - Printf.sprintf "('%s', CAST(X'%s' AS TEXT))" cid_str hex_data ) 621 - chunk 622 - |> String.concat ", " 623 - in 624 - let sql = 625 - Printf.sprintf 626 - "INSERT INTO mst (cid, data) VALUES %s ON CONFLICT DO NOTHING" 627 - values 628 - in 629 - let query = 630 - Caqti_request.Infix.( ->. ) Caqti_type.unit Caqti_type.unit sql 631 - in 632 - let%lwt result = C.exec query () in 633 - match result with 634 - | Ok () -> 635 - process_chunks rest 636 - | Error e -> 637 - Lwt.return_error e ) 638 - in 639 - process_chunks chunks 640 - 641 - let put_records (records : (string * Cid.t * bytes * string) list) conn = 642 - if List.is_empty records then Lwt.return_ok () 643 - else 644 - let module C = (val conn : Caqti_lwt.CONNECTION) in 645 - let chunks = chunk_list 100 records in 646 - let rec process_chunks = function 647 - | [] -> 648 - Lwt.return_ok () 649 - | chunk :: rest -> ( 650 - let values = 651 - List.map 652 - (fun (path, cid, data, since) -> 653 - let hex_data = bytes_to_hex data in 654 - Printf.sprintf "('%s', '%s', CAST(X'%s' AS TEXT), '%s')" 655 - (escape_sql_string path) 656 - (escape_sql_string (Cid.to_string cid)) 657 - hex_data (escape_sql_string since) ) 658 - chunk 659 - |> String.concat ", " 660 - in 661 - let sql = 662 - Printf.sprintf 663 - "INSERT INTO records (path, cid, data, since) VALUES %s ON \ 664 - CONFLICT (path) DO UPDATE SET cid = excluded.cid, data = \ 665 - excluded.data, since = excluded.since" 666 - values 667 - in 668 - let query = 669 - Caqti_request.Infix.( ->. ) Caqti_type.unit Caqti_type.unit sql 670 - in 671 - let%lwt result = C.exec query () in 672 - match result with 673 - | Ok () -> 674 - process_chunks rest 675 - | Error e -> 676 - Lwt.return_error e ) 677 - in 678 - process_chunks chunks 679 - 680 - let put_blob_refs (refs : (string * Cid.t) list) conn = 681 - if List.is_empty refs then Lwt.return_ok () 682 - else 683 - let module C = (val conn : Caqti_lwt.CONNECTION) in 684 - let chunks = chunk_list 200 refs in 685 - let rec process_chunks = function 686 - | [] -> 687 - Lwt.return_ok () 688 - | chunk :: rest -> ( 689 - let values = 690 - List.map 691 - (fun (path, cid) -> 692 - Printf.sprintf "('%s', '%s')" (escape_sql_string path) 693 - (escape_sql_string (Cid.to_string cid)) ) 694 - chunk 695 - |> String.concat ", " 696 - in 697 - let sql = 698 - Printf.sprintf 699 - "INSERT INTO blobs_records (record_path, blob_cid) VALUES %s \ 700 - ON CONFLICT DO NOTHING" 701 - values 702 - in 703 - let query = 704 - Caqti_request.Infix.( ->. ) Caqti_type.unit Caqti_type.unit sql 705 - in 706 - let%lwt result = C.exec query () in 707 - match result with 708 - | Ok () -> 709 - process_chunks rest 710 - | Error e -> 711 - Lwt.return_error e ) 712 - in 713 - process_chunks chunks 714 - end