My personal data management layer

Port bushel to use typesense library with incremental sync and dryrun

- Rewrite bushel_typesense to use typed Typesense library APIs
- Use Typesense.Field.T.v and Typesense.CollectionSchema.T.v for schemas
- Use Jsont.Json module for document construction
- Add incremental sync that compares local entries with the documents already in Typesense:
  - Export existing document IDs from Typesense
  - Only create/update changed documents
  - Delete documents that no longer exist locally
- Implement a proper dry-run mode that reports what would change:
  - Shows document IDs that would be created/deleted
  - Reports counts per collection
- Add sync_stats tracking (created, updated, deleted, unchanged, errors)
- Update bushel_sync.ml to use new Typesense_auth.Client API

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

+498 -242
+30 -12
lib_sync/bushel_sync.ml
··· 364 364 365 365 (** {1 Typesense Upload} *) 366 366 367 - let upload_typesense ~dry_run config _entries = 368 - Log.info (fun m -> m "Uploading to Typesense..."); 367 + let upload_typesense ~dry_run ~sw ~env config entries = 368 + Log.info (fun m -> m "%s Typesense..." (if dry_run then "Checking" else "Syncing")); 369 369 370 370 match Bushel_config.typesense_api_key config with 371 371 | Error e -> 372 372 { step = Typesense; success = false; 373 373 message = "Missing Typesense API key"; 374 374 details = [e] } 375 - | Ok _api_key -> 376 - if dry_run then 377 - { step = Typesense; success = true; 378 - message = "Would upload to Typesense"; 379 - details = ["POST to Typesense API (not yet implemented)"] } 380 - else 381 - (* TODO: Implement actual Typesense upload using bushel-typesense *) 382 - { step = Typesense; success = true; 383 - message = "Typesense upload (not yet implemented)"; 375 + | Ok api_key -> 376 + try 377 + (* Create Typesense client *) 378 + let client = Typesense_auth.Client.login ~sw ~env 379 + ~server_url:config.typesense_endpoint 380 + ~api_key 381 + () in 382 + 383 + (* Run incremental sync *) 384 + let result = Bushel_typesense.sync ~dry_run ~client ~entries in 385 + 386 + (* Format details from each collection *) 387 + let details = List.concat_map (fun (r : Bushel_typesense.collection_sync_result) -> 388 + let stats = r.stats in 389 + let summary = Printf.sprintf "%s: %d created, %d updated, %d deleted" 390 + r.collection stats.created stats.updated stats.deleted in 391 + summary :: r.details 392 + ) result.collections in 393 + 394 + { step = Typesense; success = result.total_errors = 0; 395 + message = Printf.sprintf "%s: %d created, %d updated, %d deleted, %d errors" 396 + (if dry_run then "Would sync" else "Synced") 397 + result.total_created result.total_updated result.total_deleted result.total_errors; 398 + details } 399 + with e -> 400 + { step = Typesense; success = false; 401 + message = Printf.sprintf "Typesense sync 
failed: %s" (Printexc.to_string e); 384 402 details = [] } 385 403 386 404 (** {1 Run Pipeline} *) ··· 401 419 | Thumbs -> generate_paper_thumbnails ~dry_run ~fs ~proc_mgr config 402 420 | Faces -> sync_faces ~dry_run ~fs config entries 403 421 | Videos -> sync_video_thumbnails ~dry_run ~http config entries 404 - | Typesense -> upload_typesense ~dry_run config entries 422 + | Typesense -> upload_typesense ~dry_run ~sw ~env config entries 405 423 ) steps in 406 424 407 425 (* Summary *)
+4 -1
lib_sync/dune
··· 4 4 (libraries 5 5 bushel 6 6 bushel.config 7 + bushel.typesense 7 8 eio 8 9 unix 9 10 jsont ··· 18 19 sortal 19 20 srcsetter-cmd 20 21 requests 21 - peertube)) 22 + peertube 23 + typesense 24 + typesense.auth))
+463 -228
lib_typesense/bushel_typesense.ml
··· 5 5 6 6 (** Typesense search integration for Bushel entries *) 7 7 8 - (** {1 Schema Definitions} *) 8 + let src = Logs.Src.create "bushel.typesense" ~doc:"Bushel Typesense sync" 9 + module Log = (val Logs.src_log src : Logs.LOG) 9 10 10 - let field name type_ ?(facet=false) ?(optional=false) () = 11 - let fields = [ 12 - ("name", `String name); 13 - ("type", `String type_); 14 - ] in 15 - let fields = if facet then ("facet", `Bool true) :: fields else fields in 16 - let fields = if optional then ("optional", `Bool true) :: fields else fields in 17 - `O fields 11 + (** {1 Schema Definitions using Typesense library} *) 12 + 13 + let field ~name ~type_ ?facet ?optional () = 14 + Typesense.Field.T.v ~name ~type_ 15 + ?facet ?optional () 18 16 19 17 let notes_schema = 20 - `O [ 21 - ("name", `String "notes"); 22 - ("fields", `A [ 23 - field "id" "string" (); 24 - field "title" "string" (); 25 - field "content" "string" (); 26 - field "date" "string" (); 27 - field "date_timestamp" "int64" (); 28 - field "tags" "string[]" ~facet:true (); 29 - field "body" "string" ~optional:true (); 30 - field "draft" "bool" (); 31 - field "synopsis" "string[]" ~optional:true (); 32 - field "thumbnail_url" "string" ~optional:true (); 33 - field "type" "string" ~facet:true ~optional:true (); 34 - field "status" "string" ~facet:true ~optional:true (); 35 - field "related_papers" "string[]" ~optional:true (); 36 - field "related_projects" "string[]" ~optional:true (); 37 - field "related_contacts" "string[]" ~optional:true (); 38 - field "attachments" "string[]" ~optional:true (); 39 - field "source" "string" ~facet:true ~optional:true (); 40 - field "url" "string" ~optional:true (); 41 - field "author" "string" ~optional:true (); 42 - field "category" "string" ~facet:true ~optional:true (); 43 - field "slug_ent" "string" ~optional:true (); 44 - field "words" "int32" ~optional:true (); 45 - ]); 46 - ("default_sorting_field", `String "date_timestamp"); 47 - ] 18 + 
Typesense.CollectionSchema.T.v 19 + ~name:"notes" 20 + ~default_sorting_field:"date_timestamp" 21 + ~fields:[ 22 + field ~name:"id" ~type_:"string" (); 23 + field ~name:"title" ~type_:"string" (); 24 + field ~name:"content" ~type_:"string" (); 25 + field ~name:"date" ~type_:"string" (); 26 + field ~name:"date_timestamp" ~type_:"int64" (); 27 + field ~name:"tags" ~type_:"string[]" ~facet:true (); 28 + field ~name:"body" ~type_:"string" ~optional:true (); 29 + field ~name:"draft" ~type_:"bool" (); 30 + field ~name:"synopsis" ~type_:"string[]" ~optional:true (); 31 + field ~name:"thumbnail_url" ~type_:"string" ~optional:true (); 32 + field ~name:"type" ~type_:"string" ~facet:true ~optional:true (); 33 + field ~name:"status" ~type_:"string" ~facet:true ~optional:true (); 34 + field ~name:"related_papers" ~type_:"string[]" ~optional:true (); 35 + field ~name:"related_projects" ~type_:"string[]" ~optional:true (); 36 + field ~name:"related_contacts" ~type_:"string[]" ~optional:true (); 37 + field ~name:"attachments" ~type_:"string[]" ~optional:true (); 38 + field ~name:"source" ~type_:"string" ~facet:true ~optional:true (); 39 + field ~name:"url" ~type_:"string" ~optional:true (); 40 + field ~name:"author" ~type_:"string" ~optional:true (); 41 + field ~name:"category" ~type_:"string" ~facet:true ~optional:true (); 42 + field ~name:"slug_ent" ~type_:"string" ~optional:true (); 43 + field ~name:"words" ~type_:"int32" ~optional:true (); 44 + ] () 48 45 49 46 let papers_schema = 50 - `O [ 51 - ("name", `String "papers"); 52 - ("fields", `A [ 53 - field "id" "string" (); 54 - field "title" "string" (); 55 - field "authors" "string[]" (); 56 - field "abstract" "string" (); 57 - field "date" "string" (); 58 - field "date_timestamp" "int64" (); 59 - field "tags" "string[]" ~facet:true (); 60 - field "doi" "string[]" ~optional:true (); 61 - field "arxiv_id" "string" ~optional:true (); 62 - field "pdf_url" "string[]" ~optional:true (); 63 - field "thumbnail_url" "string" 
~optional:true (); 64 - field "journal" "string[]" ~optional:true (); 65 - field "related_projects" "string[]" ~optional:true (); 66 - field "related_talks" "string[]" ~optional:true (); 67 - ]); 68 - ("default_sorting_field", `String "date_timestamp"); 69 - ] 47 + Typesense.CollectionSchema.T.v 48 + ~name:"papers" 49 + ~default_sorting_field:"date_timestamp" 50 + ~fields:[ 51 + field ~name:"id" ~type_:"string" (); 52 + field ~name:"title" ~type_:"string" (); 53 + field ~name:"authors" ~type_:"string[]" (); 54 + field ~name:"abstract" ~type_:"string" (); 55 + field ~name:"date" ~type_:"string" (); 56 + field ~name:"date_timestamp" ~type_:"int64" (); 57 + field ~name:"tags" ~type_:"string[]" ~facet:true (); 58 + field ~name:"doi" ~type_:"string[]" ~optional:true (); 59 + field ~name:"arxiv_id" ~type_:"string" ~optional:true (); 60 + field ~name:"pdf_url" ~type_:"string[]" ~optional:true (); 61 + field ~name:"thumbnail_url" ~type_:"string" ~optional:true (); 62 + field ~name:"journal" ~type_:"string[]" ~optional:true (); 63 + field ~name:"related_projects" ~type_:"string[]" ~optional:true (); 64 + field ~name:"related_talks" ~type_:"string[]" ~optional:true (); 65 + ] () 70 66 71 67 let projects_schema = 72 - `O [ 73 - ("name", `String "projects"); 74 - ("fields", `A [ 75 - field "id" "string" (); 76 - field "title" "string" (); 77 - field "description" "string" (); 78 - field "start_year" "int32" (); 79 - field "finish_year" "int32" ~optional:true (); 80 - field "date" "string" (); 81 - field "date_timestamp" "int64" (); 82 - field "tags" "string[]" ~facet:true (); 83 - field "repository_url" "string" ~optional:true (); 84 - field "homepage_url" "string" ~optional:true (); 85 - field "languages" "string[]" ~facet:true ~optional:true (); 86 - field "license" "string" ~facet:true ~optional:true (); 87 - field "status" "string" ~facet:true ~optional:true (); 88 - field "related_papers" "string[]" ~optional:true (); 89 - field "related_talks" "string[]" ~optional:true 
(); 90 - field "body" "string" ~optional:true (); 91 - field "ideas" "string" ~optional:true (); 92 - ]); 93 - ("default_sorting_field", `String "date_timestamp"); 94 - ] 68 + Typesense.CollectionSchema.T.v 69 + ~name:"projects" 70 + ~default_sorting_field:"date_timestamp" 71 + ~fields:[ 72 + field ~name:"id" ~type_:"string" (); 73 + field ~name:"title" ~type_:"string" (); 74 + field ~name:"description" ~type_:"string" (); 75 + field ~name:"start_year" ~type_:"int32" (); 76 + field ~name:"finish_year" ~type_:"int32" ~optional:true (); 77 + field ~name:"date" ~type_:"string" (); 78 + field ~name:"date_timestamp" ~type_:"int64" (); 79 + field ~name:"tags" ~type_:"string[]" ~facet:true (); 80 + field ~name:"repository_url" ~type_:"string" ~optional:true (); 81 + field ~name:"homepage_url" ~type_:"string" ~optional:true (); 82 + field ~name:"languages" ~type_:"string[]" ~facet:true ~optional:true (); 83 + field ~name:"license" ~type_:"string" ~facet:true ~optional:true (); 84 + field ~name:"status" ~type_:"string" ~facet:true ~optional:true (); 85 + field ~name:"related_papers" ~type_:"string[]" ~optional:true (); 86 + field ~name:"related_talks" ~type_:"string[]" ~optional:true (); 87 + field ~name:"body" ~type_:"string" ~optional:true (); 88 + field ~name:"ideas" ~type_:"string" ~optional:true (); 89 + ] () 95 90 96 91 let ideas_schema = 97 - `O [ 98 - ("name", `String "ideas"); 99 - ("fields", `A [ 100 - field "id" "string" (); 101 - field "title" "string" (); 102 - field "description" "string" (); 103 - field "year" "int32" (); 104 - field "date" "string" (); 105 - field "date_timestamp" "int64" (); 106 - field "tags" "string[]" ~facet:true (); 107 - field "level" "string" ~facet:true (); 108 - field "status" "string" ~facet:true (); 109 - field "project" "string" ~facet:true (); 110 - field "supervisors" "string[]" ~optional:true (); 111 - field "body" "string" ~optional:true (); 112 - field "students" "string[]" ~optional:true (); 113 - field "reading" "string" 
~optional:true (); 114 - field "url" "string" ~optional:true (); 115 - ]); 116 - ("default_sorting_field", `String "date_timestamp"); 117 - ] 92 + Typesense.CollectionSchema.T.v 93 + ~name:"ideas" 94 + ~default_sorting_field:"date_timestamp" 95 + ~fields:[ 96 + field ~name:"id" ~type_:"string" (); 97 + field ~name:"title" ~type_:"string" (); 98 + field ~name:"description" ~type_:"string" (); 99 + field ~name:"year" ~type_:"int32" (); 100 + field ~name:"date" ~type_:"string" (); 101 + field ~name:"date_timestamp" ~type_:"int64" (); 102 + field ~name:"tags" ~type_:"string[]" ~facet:true (); 103 + field ~name:"level" ~type_:"string" ~facet:true (); 104 + field ~name:"status" ~type_:"string" ~facet:true (); 105 + field ~name:"project" ~type_:"string" ~facet:true (); 106 + field ~name:"supervisors" ~type_:"string[]" ~optional:true (); 107 + field ~name:"body" ~type_:"string" ~optional:true (); 108 + field ~name:"students" ~type_:"string[]" ~optional:true (); 109 + field ~name:"reading" ~type_:"string" ~optional:true (); 110 + field ~name:"url" ~type_:"string" ~optional:true (); 111 + ] () 118 112 119 113 let videos_schema = 120 - `O [ 121 - ("name", `String "videos"); 122 - ("fields", `A [ 123 - field "id" "string" (); 124 - field "title" "string" (); 125 - field "description" "string" (); 126 - field "published_date" "string" (); 127 - field "date" "string" (); 128 - field "date_timestamp" "int64" (); 129 - field "tags" "string[]" ~facet:true (); 130 - field "url" "string" (); 131 - field "uuid" "string" (); 132 - field "is_talk" "bool" (); 133 - field "paper" "string[]" ~optional:true (); 134 - field "project" "string[]" ~optional:true (); 135 - field "video_url" "string" ~optional:true (); 136 - field "embed_url" "string" ~optional:true (); 137 - field "duration" "int32" ~optional:true (); 138 - field "channel" "string" ~facet:true ~optional:true (); 139 - field "platform" "string" ~facet:true ~optional:true (); 140 - field "views" "int32" ~optional:true (); 141 - 
field "related_papers" "string[]" ~optional:true (); 142 - field "related_talks" "string[]" ~optional:true (); 143 - ]); 144 - ("default_sorting_field", `String "date_timestamp"); 145 - ] 114 + Typesense.CollectionSchema.T.v 115 + ~name:"videos" 116 + ~default_sorting_field:"date_timestamp" 117 + ~fields:[ 118 + field ~name:"id" ~type_:"string" (); 119 + field ~name:"title" ~type_:"string" (); 120 + field ~name:"description" ~type_:"string" (); 121 + field ~name:"published_date" ~type_:"string" (); 122 + field ~name:"date" ~type_:"string" (); 123 + field ~name:"date_timestamp" ~type_:"int64" (); 124 + field ~name:"tags" ~type_:"string[]" ~facet:true (); 125 + field ~name:"url" ~type_:"string" (); 126 + field ~name:"uuid" ~type_:"string" (); 127 + field ~name:"is_talk" ~type_:"bool" (); 128 + field ~name:"paper" ~type_:"string[]" ~optional:true (); 129 + field ~name:"project" ~type_:"string[]" ~optional:true (); 130 + field ~name:"video_url" ~type_:"string" ~optional:true (); 131 + field ~name:"embed_url" ~type_:"string" ~optional:true (); 132 + field ~name:"duration" ~type_:"int32" ~optional:true (); 133 + field ~name:"channel" ~type_:"string" ~facet:true ~optional:true (); 134 + field ~name:"platform" ~type_:"string" ~facet:true ~optional:true (); 135 + field ~name:"views" ~type_:"int32" ~optional:true (); 136 + field ~name:"related_papers" ~type_:"string[]" ~optional:true (); 137 + field ~name:"related_talks" ~type_:"string[]" ~optional:true (); 138 + ] () 146 139 147 140 let contacts_schema = 148 - `O [ 149 - ("name", `String "contacts"); 150 - ("fields", `A [ 151 - field "id" "string" (); 152 - field "handle" "string" (); 153 - field "name" "string" (); 154 - field "names" "string[]" ~optional:true (); 155 - field "email" "string[]" ~optional:true (); 156 - field "icon" "string[]" ~optional:true (); 157 - field "github" "string[]" ~optional:true (); 158 - field "twitter" "string[]" ~optional:true (); 159 - field "bluesky" "string[]" ~optional:true (); 160 - 
field "mastodon" "string[]" ~optional:true (); 161 - field "orcid" "string[]" ~optional:true (); 162 - field "url" "string[]" ~optional:true (); 163 - field "atom" "string[]" ~optional:true (); 164 - ]); 165 - ] 141 + Typesense.CollectionSchema.T.v 142 + ~name:"contacts" 143 + ~fields:[ 144 + field ~name:"id" ~type_:"string" (); 145 + field ~name:"handle" ~type_:"string" (); 146 + field ~name:"name" ~type_:"string" (); 147 + field ~name:"names" ~type_:"string[]" ~optional:true (); 148 + field ~name:"email" ~type_:"string[]" ~optional:true (); 149 + field ~name:"icon" ~type_:"string[]" ~optional:true (); 150 + field ~name:"github" ~type_:"string[]" ~optional:true (); 151 + field ~name:"twitter" ~type_:"string[]" ~optional:true (); 152 + field ~name:"bluesky" ~type_:"string[]" ~optional:true (); 153 + field ~name:"mastodon" ~type_:"string[]" ~optional:true (); 154 + field ~name:"orcid" ~type_:"string[]" ~optional:true (); 155 + field ~name:"url" ~type_:"string[]" ~optional:true (); 156 + field ~name:"atom" ~type_:"string[]" ~optional:true (); 157 + ] () 158 + 159 + (** All collection schemas *) 160 + let all_schemas = [ 161 + notes_schema; 162 + papers_schema; 163 + projects_schema; 164 + ideas_schema; 165 + videos_schema; 166 + contacts_schema; 167 + ] 166 168 167 169 (** {1 Document Conversion} *) 168 170 171 + module J = Jsont.Json 172 + 169 173 let ptime_to_timestamp t = 170 174 let span = Ptime.to_span t in 171 175 Int64.of_float (Ptime.Span.to_float_s span) ··· 175 179 | Some t -> ptime_to_timestamp t 176 180 | None -> 0L 177 181 182 + let mem k v = J.mem (J.name k) v 183 + let str s = J.string s 184 + let num f = J.number f 185 + let int64_ i = J.number (Int64.to_float i) 186 + let int_ i = J.number (Float.of_int i) 187 + let bool_ b = J.bool b 188 + let str_list l = J.list (List.map str l) 189 + 190 + let obj fields = J.object' (List.filter_map Fun.id fields) 191 + 192 + let opt_mem k = function 193 + | Some v -> Some (mem k v) 194 + | None -> None 195 + 178 
196 let note_to_document (n : Bushel.Note.t) = 179 197 let date = Bushel.Note.date n in 180 198 let (y, m, d) = date in 181 - `O ([ 182 - ("id", `String (Bushel.Note.slug n)); 183 - ("title", `String (Bushel.Note.title n)); 184 - ("content", `String (Bushel.Note.body n)); 185 - ("date", `String (Printf.sprintf "%04d-%02d-%02d" y m d)); 186 - ("date_timestamp", `Float (Int64.to_float (date_to_timestamp date))); 187 - ("tags", `A (List.map (fun t -> `String t) (Bushel.Note.tags n))); 188 - ("draft", `Bool (Bushel.Note.draft n)); 189 - ("words", `Float (Float.of_int (Bushel.Note.words n))); 190 - ] @ 191 - (match Bushel.Note.synopsis n with Some s -> [("synopsis", `A [`String s])] | None -> []) @ 192 - (match Bushel.Note.source n with Some s -> [("source", `String s)] | None -> []) @ 193 - (match Bushel.Note.url n with Some u -> [("url", `String u)] | None -> []) @ 194 - (match Bushel.Note.author n with Some a -> [("author", `String a)] | None -> []) @ 195 - (match Bushel.Note.category n with Some c -> [("category", `String c)] | None -> []) @ 196 - (match Bushel.Note.slug_ent n with Some s -> [("slug_ent", `String s)] | None -> [])) 199 + obj [ 200 + Some (mem "id" (str (Bushel.Note.slug n))); 201 + Some (mem "title" (str (Bushel.Note.title n))); 202 + Some (mem "content" (str (Bushel.Note.body n))); 203 + Some (mem "date" (str (Printf.sprintf "%04d-%02d-%02d" y m d))); 204 + Some (mem "date_timestamp" (int64_ (date_to_timestamp date))); 205 + Some (mem "tags" (str_list (Bushel.Note.tags n))); 206 + Some (mem "draft" (bool_ (Bushel.Note.draft n))); 207 + Some (mem "words" (int_ (Bushel.Note.words n))); 208 + opt_mem "synopsis" (Option.map (fun s -> str_list [s]) (Bushel.Note.synopsis n)); 209 + opt_mem "source" (Option.map str (Bushel.Note.source n)); 210 + opt_mem "url" (Option.map str (Bushel.Note.url n)); 211 + opt_mem "author" (Option.map str (Bushel.Note.author n)); 212 + opt_mem "category" (Option.map str (Bushel.Note.category n)); 213 + opt_mem "slug_ent" 
(Option.map str (Bushel.Note.slug_ent n)); 214 + ] 197 215 198 216 let paper_to_document (p : Bushel.Paper.t) = 199 217 let date = Bushel.Paper.date p in 200 218 let (y, m, d) = date in 201 - `O ([ 202 - ("id", `String (Bushel.Paper.slug p)); 203 - ("title", `String (Bushel.Paper.title p)); 204 - ("authors", `A (List.map (fun a -> `String a) (Bushel.Paper.authors p))); 205 - ("abstract", `String (Bushel.Paper.abstract p)); 206 - ("date", `String (Printf.sprintf "%04d-%02d-%02d" y m d)); 207 - ("date_timestamp", `Float (Int64.to_float (date_to_timestamp date))); 208 - ("tags", `A (List.map (fun t -> `String t) (Bushel.Paper.tags p))); 209 - ] @ 210 - (match Bushel.Paper.doi p with Some d -> [("doi", `A [`String d])] | None -> []) @ 211 - (match Bushel.Paper.url p with Some u -> [("pdf_url", `A [`String u])] | None -> []) @ 212 - (if Bushel.Paper.journal p <> "" then [("journal", `A [`String (Bushel.Paper.journal p)])] else [])) 219 + obj [ 220 + Some (mem "id" (str (Bushel.Paper.slug p))); 221 + Some (mem "title" (str (Bushel.Paper.title p))); 222 + Some (mem "authors" (str_list (Bushel.Paper.authors p))); 223 + Some (mem "abstract" (str (Bushel.Paper.abstract p))); 224 + Some (mem "date" (str (Printf.sprintf "%04d-%02d-%02d" y m d))); 225 + Some (mem "date_timestamp" (int64_ (date_to_timestamp date))); 226 + Some (mem "tags" (str_list (Bushel.Paper.tags p))); 227 + opt_mem "doi" (Option.map (fun d -> str_list [d]) (Bushel.Paper.doi p)); 228 + opt_mem "pdf_url" (Option.map (fun u -> str_list [u]) (Bushel.Paper.url p)); 229 + (let j = Bushel.Paper.journal p in if j <> "" then Some (mem "journal" (str_list [j])) else None); 230 + ] 213 231 214 232 let project_to_document (p : Bushel.Project.t) = 215 233 let date = (Bushel.Project.start p, 1, 1) in 216 234 let (y, m, d) = date in 217 - `O ([ 218 - ("id", `String (Bushel.Project.slug p)); 219 - ("title", `String (Bushel.Project.title p)); 220 - ("description", `String (Bushel.Project.body p)); 221 - ("start_year", 
`Float (Float.of_int (Bushel.Project.start p))); 222 - ("date", `String (Printf.sprintf "%04d-%02d-%02d" y m d)); 223 - ("date_timestamp", `Float (Int64.to_float (date_to_timestamp date))); 224 - ("tags", `A (List.map (fun t -> `String t) (Bushel.Project.tags p))); 225 - ("body", `String (Bushel.Project.body p)); 226 - ("ideas", `String (Bushel.Project.ideas p)); 227 - ] @ 228 - (match Bushel.Project.finish p with Some f -> [("finish_year", `Float (Float.of_int f))] | None -> [])) 235 + obj [ 236 + Some (mem "id" (str (Bushel.Project.slug p))); 237 + Some (mem "title" (str (Bushel.Project.title p))); 238 + Some (mem "description" (str (Bushel.Project.body p))); 239 + Some (mem "start_year" (int_ (Bushel.Project.start p))); 240 + Some (mem "date" (str (Printf.sprintf "%04d-%02d-%02d" y m d))); 241 + Some (mem "date_timestamp" (int64_ (date_to_timestamp date))); 242 + Some (mem "tags" (str_list (Bushel.Project.tags p))); 243 + Some (mem "body" (str (Bushel.Project.body p))); 244 + Some (mem "ideas" (str (Bushel.Project.ideas p))); 245 + opt_mem "finish_year" (Option.map int_ (Bushel.Project.finish p)); 246 + ] 229 247 230 248 let idea_to_document (i : Bushel.Idea.t) = 231 249 let date = (Bushel.Idea.year i, Bushel.Idea.month i, 1) in 232 250 let (y, m, d) = date in 233 - `O [ 234 - ("id", `String (Bushel.Idea.slug i)); 235 - ("title", `String (Bushel.Idea.title i)); 236 - ("description", `String (Bushel.Idea.body i)); 237 - ("year", `Float (Float.of_int (Bushel.Idea.year i))); 238 - ("date", `String (Printf.sprintf "%04d-%02d-%02d" y m d)); 239 - ("date_timestamp", `Float (Int64.to_float (date_to_timestamp date))); 240 - ("tags", `A (List.map (fun t -> `String t) (Bushel.Idea.tags i))); 241 - ("level", `String (Bushel.Idea.level_to_string (Bushel.Idea.level i))); 242 - ("status", `String (Bushel.Idea.status_to_string (Bushel.Idea.status i))); 243 - ("project", `String (Bushel.Idea.project i)); 244 - ("supervisors", `A (List.map (fun s -> `String s) 
(Bushel.Idea.supervisors i))); 245 - ("students", `A (List.map (fun s -> `String s) (Bushel.Idea.students i))); 246 - ("body", `String (Bushel.Idea.body i)); 247 - ("reading", `String (Bushel.Idea.reading i)); 251 + obj [ 252 + Some (mem "id" (str (Bushel.Idea.slug i))); 253 + Some (mem "title" (str (Bushel.Idea.title i))); 254 + Some (mem "description" (str (Bushel.Idea.body i))); 255 + Some (mem "year" (int_ (Bushel.Idea.year i))); 256 + Some (mem "date" (str (Printf.sprintf "%04d-%02d-%02d" y m d))); 257 + Some (mem "date_timestamp" (int64_ (date_to_timestamp date))); 258 + Some (mem "tags" (str_list (Bushel.Idea.tags i))); 259 + Some (mem "level" (str (Bushel.Idea.level_to_string (Bushel.Idea.level i)))); 260 + Some (mem "status" (str (Bushel.Idea.status_to_string (Bushel.Idea.status i)))); 261 + Some (mem "project" (str (Bushel.Idea.project i))); 262 + Some (mem "supervisors" (str_list (Bushel.Idea.supervisors i))); 263 + Some (mem "students" (str_list (Bushel.Idea.students i))); 264 + Some (mem "body" (str (Bushel.Idea.body i))); 265 + Some (mem "reading" (str (Bushel.Idea.reading i))); 248 266 ] 249 267 250 268 let video_to_document (v : Bushel.Video.t) = 251 269 let date = Bushel.Video.date v in 252 270 let (y, m, d) = date in 253 - `O ([ 254 - ("id", `String (Bushel.Video.uuid v)); 255 - ("title", `String (Bushel.Video.title v)); 256 - ("description", `String (Bushel.Video.description v)); 257 - ("published_date", `String (Ptime.to_rfc3339 (Bushel.Video.datetime v))); 258 - ("date", `String (Printf.sprintf "%04d-%02d-%02d" y m d)); 259 - ("date_timestamp", `Float (Int64.to_float (date_to_timestamp date))); 260 - ("tags", `A (List.map (fun t -> `String t) (Bushel.Video.tags v))); 261 - ("url", `String (Bushel.Video.url v)); 262 - ("uuid", `String (Bushel.Video.uuid v)); 263 - ("is_talk", `Bool (Bushel.Video.talk v)); 264 - ] @ 265 - (match Bushel.Video.paper v with Some p -> [("paper", `A [`String p])] | None -> []) @ 266 - (match Bushel.Video.project v 
with Some p -> [("project", `A [`String p])] | None -> [])) 271 + obj [ 272 + Some (mem "id" (str (Bushel.Video.uuid v))); 273 + Some (mem "title" (str (Bushel.Video.title v))); 274 + Some (mem "description" (str (Bushel.Video.description v))); 275 + Some (mem "published_date" (str (Ptime.to_rfc3339 (Bushel.Video.datetime v)))); 276 + Some (mem "date" (str (Printf.sprintf "%04d-%02d-%02d" y m d))); 277 + Some (mem "date_timestamp" (int64_ (date_to_timestamp date))); 278 + Some (mem "tags" (str_list (Bushel.Video.tags v))); 279 + Some (mem "url" (str (Bushel.Video.url v))); 280 + Some (mem "uuid" (str (Bushel.Video.uuid v))); 281 + Some (mem "is_talk" (bool_ (Bushel.Video.talk v))); 282 + opt_mem "paper" (Option.map (fun p -> str_list [p]) (Bushel.Video.paper v)); 283 + opt_mem "project" (Option.map (fun p -> str_list [p]) (Bushel.Video.project v)); 284 + ] 267 285 268 286 let contact_to_document (c : Sortal_schema.Contact.t) = 269 287 (* Extract atom feed URLs from Sortal feeds *) ··· 276 294 ) feeds 277 295 | None -> [] 278 296 in 279 - `O ([ 280 - ("id", `String (Sortal_schema.Contact.handle c)); 281 - ("handle", `String (Sortal_schema.Contact.handle c)); 282 - ("name", `String (Sortal_schema.Contact.name c)); 283 - ("names", `A (List.map (fun n -> `String n) (Sortal_schema.Contact.names c))); 284 - ] @ 285 - (match Sortal_schema.Contact.current_email c with Some e -> [("email", `A [`String e])] | None -> []) @ 286 - (match Sortal_schema.Contact.github_handle c with Some g -> [("github", `A [`String g])] | None -> []) @ 287 - (match Sortal_schema.Contact.twitter_handle c with Some t -> [("twitter", `A [`String t])] | None -> []) @ 288 - (match Sortal_schema.Contact.bluesky_handle c with Some b -> [("bluesky", `A [`String b])] | None -> []) @ 289 - (match Sortal_schema.Contact.mastodon_handle c with Some m -> [("mastodon", `A [`String m])] | None -> []) @ 290 - (match Sortal_schema.Contact.orcid c with Some o -> [("orcid", `A [`String o])] | None -> []) @ 291 - 
(match Sortal_schema.Contact.current_url c with Some u -> [("url", `A [`String u])] | None -> []) @ 292 - (if atom_urls = [] then [] else [("atom", `A (List.map (fun x -> `String x) atom_urls))])) 297 + obj [ 298 + Some (mem "id" (str (Sortal_schema.Contact.handle c))); 299 + Some (mem "handle" (str (Sortal_schema.Contact.handle c))); 300 + Some (mem "name" (str (Sortal_schema.Contact.name c))); 301 + Some (mem "names" (str_list (Sortal_schema.Contact.names c))); 302 + opt_mem "email" (Option.map (fun e -> str_list [e]) (Sortal_schema.Contact.current_email c)); 303 + opt_mem "github" (Option.map (fun g -> str_list [g]) (Sortal_schema.Contact.github_handle c)); 304 + opt_mem "twitter" (Option.map (fun t -> str_list [t]) (Sortal_schema.Contact.twitter_handle c)); 305 + opt_mem "bluesky" (Option.map (fun b -> str_list [b]) (Sortal_schema.Contact.bluesky_handle c)); 306 + opt_mem "mastodon" (Option.map (fun m -> str_list [m]) (Sortal_schema.Contact.mastodon_handle c)); 307 + opt_mem "orcid" (Option.map (fun o -> str_list [o]) (Sortal_schema.Contact.orcid c)); 308 + opt_mem "url" (Option.map (fun u -> str_list [u]) (Sortal_schema.Contact.current_url c)); 309 + (if atom_urls = [] then None else Some (mem "atom" (str_list atom_urls))); 310 + ] 311 + 312 + (** {1 Document ID Extraction} *) 313 + 314 + let get_doc_id (doc : Jsont.json) : string option = 315 + match doc with 316 + | Jsont.Object (fields, _) -> 317 + List.find_map (fun ((name, _), v) -> 318 + if name = "id" then 319 + match v with Jsont.String (s, _) -> Some s | _ -> None 320 + else None 321 + ) fields 322 + | _ -> None 323 + 324 + (** {1 Sync State} *) 325 + 326 + type sync_stats = { 327 + mutable created : int; 328 + mutable updated : int; 329 + mutable deleted : int; 330 + mutable unchanged : int; 331 + mutable errors : int; 332 + } 333 + 334 + let empty_stats () = { 335 + created = 0; 336 + updated = 0; 337 + deleted = 0; 338 + unchanged = 0; 339 + errors = 0; 340 + } 341 + 342 + (** {1 Collection Sync} 
*)

(** Outcome of syncing one collection: the collection name as passed to
    [sync_collection], its counters, and human-readable detail lines
    (dry-run previews or error messages) in the order they were produced. *)
type collection_sync_result = {
  collection : string;
  stats : sync_stats;
  details : string list;
}

(** Ensure a collection exists, creating it if necessary.

    Returns [`Exists] when the lookup succeeds and [`Created] after a
    successful create. Any lookup failure is treated as "missing"; if the
    subsequent create also fails, that exception propagates to the caller. *)
let ensure_collection (client : Typesense_auth.Client.t) (schema : Typesense.CollectionSchema.T.t) =
  let name = Typesense.CollectionSchema.T.name schema in
  let ts = Typesense_auth.Client.client client in
  match Typesense.Collection.get_collection ~collection_name:name ts () with
  | _ ->
    Log.debug (fun m -> m "Collection %s already exists" name);
    `Exists
  | exception e ->
    (* Keep the reason visible: the lookup can fail for causes other than
       a missing collection, and the create below will then fail too. *)
    Log.debug (fun m -> m "Lookup of collection %s failed: %s" name (Printexc.to_string e));
    Log.info (fun m -> m "Creating collection %s" name);
    let _ = Typesense.Collection.create_collection ~body:schema ts () in
    `Created

(* Export the IDs of every document currently stored in [collection].
   Returns [Error exn] instead of raising so callers can tell "no documents"
   apart from "export failed". *)
let export_existing_ids (client : Typesense_auth.Client.t) ~collection =
  match
    let params = Typesense_auth.Client.export_params ~include_fields:["id"] () in
    Typesense_auth.Client.export client ~collection ~params ()
  with
  | docs -> Ok (List.filter_map get_doc_id docs)
  | exception e -> Error e

(** Get existing document IDs from a collection.

    Best-effort: when the export fails, the failure is logged (with its
    cause) and the empty list is returned. *)
let get_existing_ids (client : Typesense_auth.Client.t) ~collection : string list =
  match export_existing_ids client ~collection with
  | Ok ids -> ids
  | Error e ->
    Log.warn (fun m -> m "Failed to export existing documents from %s: %s"
      collection (Printexc.to_string e));
    []

(* Build a membership table from a list of IDs. *)
let id_set_of_list ids =
  let set = Hashtbl.create (List.length ids) in
  List.iter (fun id -> Hashtbl.replace set id ()) ids;
  set

(* Maximum number of individual IDs listed per category in dry-run details. *)
let dry_run_detail_limit = 5

(* Dry-run bookkeeping: compare local IDs with remote IDs and record what a
   real sync would create or delete.
   NOTE(review): IDs present on both sides are counted as [unchanged] here,
   whereas a real sync upserts them and counts them as [updated]. *)
let report_dry_run (stats : sync_stats) details ~new_ids ~existing_ids =
  let new_id_set = id_set_of_list new_ids in
  let existing_set = id_set_of_list existing_ids in
  (* Creates: local but not remote. *)
  List.iter (fun id ->
    if Hashtbl.mem existing_set id then
      stats.unchanged <- stats.unchanged + 1
    else begin
      stats.created <- stats.created + 1;
      if stats.created <= dry_run_detail_limit then
        details := Printf.sprintf "Would create: %s" id :: !details
    end
  ) new_ids;
  (* Deletes: remote but not local. *)
  List.iter (fun id ->
    if not (Hashtbl.mem new_id_set id) then begin
      stats.deleted <- stats.deleted + 1;
      if stats.deleted <= dry_run_detail_limit then
        details := Printf.sprintf "Would delete: %s" id :: !details
    end
  ) existing_ids;
  if stats.created > dry_run_detail_limit then
    details := Printf.sprintf "...and %d more creates" (stats.created - dry_run_detail_limit) :: !details;
  if stats.deleted > dry_run_detail_limit then
    details := Printf.sprintf "...and %d more deletes" (stats.deleted - dry_run_detail_limit) :: !details

(* Fold the per-document import results into [stats].

   A success is a create when its ID was not in [existing_set], otherwise an
   update. The ID is taken from the document echoed in the result when there
   is one; otherwise from the document sent at the same position, relying on
   import results being returned in request order. Without that fallback a
   response that does not echo documents would report every success as an
   update. *)
let record_import_results (stats : sync_stats) details ~existing_set ~documents results =
  let echoed_id (r : Typesense_auth.Client.import_result) =
    match r.document with
    | None -> None
    | Some doc_str ->
      (match Jsont_bytesrw.decode_string Jsont.json doc_str with
       | Ok doc -> get_doc_id doc
       | Error _ -> None)
  in
  let record sent (r : Typesense_auth.Client.import_result) =
    if r.success then begin
      let id =
        match echoed_id r with
        | Some _ as id -> id
        | None -> Option.bind sent get_doc_id
      in
      match id with
      | Some id when not (Hashtbl.mem existing_set id) ->
        stats.created <- stats.created + 1
      | _ ->
        stats.updated <- stats.updated + 1
    end else begin
      stats.errors <- stats.errors + 1;
      Option.iter (fun e -> details := e :: !details) r.error
    end
  in
  (* Walk both lists in lock-step; tolerate a length mismatch. *)
  let rec walk docs results =
    match docs, results with
    | _, [] -> ()
    | [], r :: rest -> record None r; walk [] rest
    | d :: ds, r :: rest -> record (Some d) r; walk ds rest
  in
  walk documents results

(** Sync a single collection incrementally.

    - [dry_run = true]: nothing is written. Remote IDs are exported and
      compared with the IDs of [documents]; if the export fails the
      collection is assumed not to exist yet.
    - [dry_run = false]: the collection is created if missing, every
      document is upserted, and remote documents whose ID is absent from
      [documents] are deleted. If the remote IDs cannot be exported, an
      error is recorded and no deletions are attempted.

    Exceptions from [ensure_collection] or the bulk import propagate;
    individual delete failures are recorded in the result instead. *)
let sync_collection ~dry_run (client : Typesense_auth.Client.t)
    ~collection ~(schema : Typesense.CollectionSchema.T.t)
    ~(documents : Jsont.json list) : collection_sync_result =
  let stats = empty_stats () in
  let details = ref [] in
  let name = Typesense.CollectionSchema.T.name schema in
  let new_ids = List.filter_map get_doc_id documents in

  if dry_run then begin
    match export_existing_ids client ~collection with
    | Ok existing_ids ->
      report_dry_run stats details ~new_ids ~existing_ids
    | Error e ->
      (* Presumably the collection doesn't exist yet; log the cause in case
         it is something else. *)
      Log.debug (fun m -> m "Could not export %s (%s); assuming it does not exist"
        collection (Printexc.to_string e));
      stats.created <- List.length documents;
      details := [Printf.sprintf "Would create collection with %d documents" stats.created]
  end else begin
    ignore (ensure_collection client schema);

    (* Remote state before the upsert; needed both to classify results and
       to find stale documents. *)
    let existing_ids =
      match export_existing_ids client ~collection with
      | Ok ids -> ids
      | Error e ->
        let reason = Printexc.to_string e in
        Log.warn (fun m -> m "Failed to export existing documents from %s: %s" collection reason);
        stats.errors <- stats.errors + 1;
        details :=
          Printf.sprintf "Failed to export existing documents from %s: %s (deletions skipped)"
            collection reason
          :: !details;
        []
    in
    let existing_set = id_set_of_list existing_ids in

    (* Upsert all local documents. *)
    if documents <> [] then begin
      Log.info (fun m -> m "Upserting %d documents to %s" (List.length documents) name);
      let results = Typesense_auth.Client.import client ~collection ~action:Upsert documents in
      record_import_results stats details ~existing_set ~documents results
    end;

    (* Delete remote documents that no longer exist locally. *)
    let new_id_set = id_set_of_list new_ids in
    let to_delete = List.filter (fun id -> not (Hashtbl.mem new_id_set id)) existing_ids in
    if to_delete <> [] then begin
      Log.info (fun m -> m "Deleting %d documents from %s" (List.length to_delete) name);
      let ts = Typesense_auth.Client.client client in
      List.iter (fun id ->
        try
          let _ = Typesense.Client.delete_document
            ~collection_name:collection ~document_id:id ts () in
          stats.deleted <- stats.deleted + 1
        with e ->
          stats.errors <- stats.errors + 1;
          details := Printf.sprintf "Failed to delete %s: %s" id (Printexc.to_string e) :: !details
      ) to_delete
    end
  end;

  { collection; stats; details = List.rev !details }

(** {1 Full Sync} *)

(** Aggregate outcome of [sync]: the per-collection results plus totals
    summed over all of them. *)
type sync_result = {
  collections : collection_sync_result list;
  total_created : int;
  total_updated : int;
  total_deleted : int;
  total_errors : int;
}

(** Sync every Bushel collection (notes, papers, projects, ideas, videos,
    contacts) to Typesense, or with [~dry_run:true] only report what would
    change. Collections are processed in the order listed. *)
let sync ~dry_run ~(client : Typesense_auth.Client.t) ~(entries : Bushel.Entry.t) : sync_result =
  Log.info (fun m -> m "%s Typesense collections..."
    (if dry_run then "Checking" else "Syncing"));

  (* Collection name, schema and documents for each collection. *)
  let plan = [
    "notes", notes_schema, List.map note_to_document (Bushel.Entry.notes entries);
    "papers", papers_schema, List.map paper_to_document (Bushel.Entry.papers entries);
    "projects", projects_schema, List.map project_to_document (Bushel.Entry.projects entries);
    "ideas", ideas_schema, List.map idea_to_document (Bushel.Entry.ideas entries);
    "videos", videos_schema, List.map video_to_document (Bushel.Entry.videos entries);
    "contacts", contacts_schema, List.map contact_to_document (Bushel.Entry.contacts entries);
  ] in

  (* A left fold pins the processing order; the element order of a list
     literal of effectful calls is unspecified. *)
  let collections =
    List.rev (List.fold_left (fun acc (collection, schema, documents) ->
      sync_collection ~dry_run client ~collection ~schema ~documents :: acc
    ) [] plan)
  in

  let total (pick : sync_stats -> int) =
    List.fold_left (fun acc (r : collection_sync_result) -> acc + pick r.stats) 0 collections
  in
  let total_created = total (fun s -> s.created) in
  let total_updated = total (fun s -> s.updated) in
  let total_deleted = total (fun s -> s.deleted) in
  let total_errors = total (fun s -> s.errors) in

  Log.info (fun m -> m "Sync complete: %d created, %d updated, %d deleted, %d errors"
    total_created total_updated total_deleted total_errors);

  { collections; total_created; total_updated; total_deleted; total_errors }
+1 -1
lib_typesense/dune
··· 1 1 (library 2 2 (name bushel_typesense) 3 3 (public_name bushel.typesense) 4 - (libraries bushel jsont ptime sortal.schema)) 4 + (libraries bushel jsont jsont.bytesrw ptime sortal.schema typesense typesense.auth logs))