My personal data management layer

Fix JSON feed differences: slugs, PDFs, DOI cache

- Normalize slugs to match Jekyll behavior (dots → dashes)
  so geotessera-python-0.7 becomes geotessera-python-0-7

- Add filesystem PDF check to JSON feed attachments
  matching arod_papers.ml behavior for static/papers/*.pdf

- Add DOI cache infrastructure for external references:
  - New bushel_doi_entry.ml module for parsing data/doi.yml
  - Load DOI entries in bushel_loader.ml
  - Complete note_references with external DOI URL scanning
  - Scan for doi.org URLs and publisher URLs (Elsevier, IEEE, etc.)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

+172 -6
+3
lib/bushel.ml
··· 84 84 module Types = Bushel_types 85 85 (** Common types and Jsont codecs. *) 86 86 87 + module Doi_entry = Bushel_doi_entry 88 + (** DOI entries resolved from external sources. *) 89 + 87 90 module Util = Bushel_util 88 91 (** Utility functions (word counting, text processing). *)
+98
lib/bushel_doi_entry.ml
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

(** DOI entries resolved from external sources via Zotero Translation Server *)

(** Outcome recorded for an entry: [Failed msg] carries the text of the
    entry's ["error"] key, [Resolved] is used when that key is absent. *)
type status =
  | Resolved
  | Failed of string

(** One cached DOI record. String fields that are missing from the YAML
    mapping are stored as [""], [year] as [0] and list fields as [[]]. *)
type t = {
  doi : string;
  title : string;
  authors : string list;
  year : int;
  bibtype : string;
  publisher : string;
  resolved_at : string;  (* copied verbatim from YAML; format not checked *)
  source_urls : string list;
  status : status;
  ignore : bool;  (* when [true] the lookup functions below skip the entry *)
}

type ts = t list

(** [get_string_opt key fields] is [Some s] when [key] is bound to a YAML
    string [s] in [fields], and [None] otherwise. *)
let get_string_opt key fields =
  match List.assoc_opt key fields with
  | Some (`String s) -> Some s
  | _ -> None

(** Like {!get_string_opt} but defaults to the empty string. *)
let get_string key fields =
  Option.value (get_string_opt key fields) ~default:""

(** [get_int key fields] reads an integer-valued field, defaulting to [0].
    Finite numbers are truncated towards zero. A string that parses as an
    integer (e.g. a quoted year) is accepted too. Non-finite floats map to
    [0] because [int_of_float] is unspecified for them. *)
let get_int key fields =
  match List.assoc_opt key fields with
  | Some (`Float f) when Float.is_finite f -> int_of_float f
  | Some (`String s) ->
    Option.value (int_of_string_opt (String.trim s)) ~default:0
  | _ -> 0

(** [get_bool key fields] reads a boolean field, defaulting to [false]. *)
let get_bool key fields =
  match List.assoc_opt key fields with
  | Some (`Bool b) -> b
  | _ -> false

(** [get_strings key fields] returns the string elements of the sequence
    bound to [key]; non-string elements are dropped and a missing or
    non-sequence value yields [[]]. *)
let get_strings key fields =
  match List.assoc_opt key fields with
  | Some (`A items) ->
    List.filter_map (function `String s -> Some s | _ -> None) items
  | _ -> []

(** [of_yaml_value v] converts one YAML mapping into an entry.

    - Returns [None] when [v] is not a mapping.
    - ["source_urls"] is preferred; when it is empty or absent the singular
      ["source_url"] key is used as a one-element list.
    - When an ["error"] string is present the entry is [Failed] and its
      bibliographic fields are left at their defaults. *)
let of_yaml_value = function
  | `O fields ->
    let doi = get_string "doi" fields in
    let resolved_at = get_string "resolved_at" fields in
    let source_urls =
      match get_strings "source_urls" fields with
      | [] -> Option.to_list (get_string_opt "source_url" fields)
      | urls -> urls
    in
    (* Bound as [ignored] so that [Stdlib.ignore] is not shadowed here. *)
    let ignored = get_bool "ignore" fields in
    (match get_string_opt "error" fields with
     | Some err ->
       Some { doi; title = ""; authors = []; year = 0; bibtype = "";
              publisher = ""; resolved_at; source_urls;
              status = Failed err; ignore = ignored }
     | None ->
       Some { doi;
              title = get_string "title" fields;
              authors = get_strings "authors" fields;
              year = get_int "year" fields;
              bibtype = get_string "bibtype" fields;
              publisher = get_string "publisher" fields;
              resolved_at; source_urls;
              status = Resolved; ignore = ignored })
  | _ -> None

(** Load DOI entries from a YAML string.

    Returns [[]] when the document is not a top-level sequence or when the
    parser raises [Yamlrw.Yamlrw_error]; a malformed file is therefore
    indistinguishable from an empty one for the caller. Sequence elements
    that are not mappings are skipped. *)
let of_yaml_string str =
  match Yamlrw.of_string str with
  | `A entries -> List.filter_map of_yaml_value entries
  | _ -> []
  | exception Yamlrw.Yamlrw_error _ -> []

(* DOI names are case-insensitive (ASCII case folding), so both sides of a
   lookup are folded to lowercase; surrounding whitespace is dropped. *)
let normalize_doi doi = String.lowercase_ascii (String.trim doi)

(** Find entry by DOI (excludes ignored entries).

    The comparison is ASCII case-insensitive. An empty [doi] never matches,
    so entries that were stored without a DOI (as [""]) are not returned by
    accident. *)
let find_by_doi entries doi =
  match normalize_doi doi with
  | "" -> None
  | wanted ->
    List.find_opt
      (fun entry ->
         (not entry.ignore) && String.equal (normalize_doi entry.doi) wanted)
      entries

(** Find entry by source URL (excludes ignored entries).

    [url] must equal one of the entry's [source_urls] exactly. *)
let find_by_url entries url =
  List.find_opt
    (fun entry ->
       (not entry.ignore) && List.exists (String.equal url) entry.source_urls)
    entries
+4 -2
lib/bushel_entry.ml
··· 27 27 images : Srcsetter.t list; 28 28 image_index : (string, Srcsetter.t) Hashtbl.t; 29 29 data_dir : string; 30 + doi_entries : Bushel_doi_entry.ts; 30 31 } 31 32 32 33 (** {1 Constructors} *) 33 34 34 - let v ~papers ~notes ~projects ~ideas ~videos ~contacts ?(images=[]) ~data_dir () = 35 + let v ~papers ~notes ~projects ~ideas ~videos ~contacts ?(images=[]) ?(doi_entries=[]) ~data_dir () = 35 36 let slugs : slugs = Hashtbl.create 42 in 36 37 let papers, old_papers = List.partition (fun p -> p.Bushel_paper.latest) papers in 37 38 List.iter (fun n -> Hashtbl.add slugs n.Bushel_note.slug (`Note n)) notes; ··· 42 43 (* Build image index *) 43 44 let image_index = Hashtbl.create (List.length images) in 44 45 List.iter (fun img -> Hashtbl.add image_index (Srcsetter.slug img) img) images; 45 - { slugs; papers; old_papers; notes; projects; ideas; videos; contacts; images; image_index; data_dir } 46 + { slugs; papers; old_papers; notes; projects; ideas; videos; contacts; images; image_index; data_dir; doi_entries } 46 47 47 48 (** {1 Accessors} *) 48 49 ··· 55 56 let old_papers { old_papers; _ } = old_papers 56 57 let images { images; _ } = images 57 58 let data_dir { data_dir; _ } = data_dir 59 + let doi_entries { doi_entries; _ } = doi_entries 58 60 59 61 (** {1 Image Lookup} *) 60 62
+2
lib/bushel_entry.mli
··· 30 30 videos:Bushel_video.t list -> 31 31 contacts:Sortal_schema.Contact.t list -> 32 32 ?images:Srcsetter.t list -> 33 + ?doi_entries:Bushel_doi_entry.ts -> 33 34 data_dir:string -> 34 35 unit -> 35 36 t ··· 46 47 val old_papers : t -> Bushel_paper.ts 47 48 val images : t -> Srcsetter.t list 48 49 val data_dir : t -> string 50 + val doi_entries : t -> Bushel_doi_entry.ts 49 51 50 52 (** {1 Lookup Functions} *) 51 53
+52 -3
lib/bushel_md.ml
··· 882 882 | _ -> () 883 883 ) slugs; 884 884 885 - (* TODO: Add external DOI URL scanning and publisher URL resolution *) 886 - (* This requires DOI caching infrastructure which is not yet ported *) 885 + (* Scan body for external DOI URLs and resolve from cache *) 886 + let body = Bushel_note.body note in 887 + let doi_url_pattern = Re.Perl.compile_pat "https?://(?:dx\\.)?doi\\.org/([^)\\s\"'>]+)" in 888 + let doi_matches = Re.all doi_url_pattern body in 889 + let doi_entries = Bushel_entry.doi_entries entries in 890 + List.iter (fun group -> 891 + try 892 + let encoded_doi = Re.Group.get group 1 in 893 + let doi = Uri.pct_decode encoded_doi in 894 + if not (List.exists (fun (d, _, _) -> d = doi) !refs) then 895 + match Bushel_doi_entry.find_by_doi doi_entries doi with 896 + | Some doi_entry when doi_entry.status = Resolved -> 897 + let citation = format_citation 898 + ~authors:doi_entry.authors 899 + ~year:doi_entry.year 900 + ~title:doi_entry.title 901 + ~publisher:(Some doi_entry.publisher) 902 + in 903 + refs := (doi, citation, External) :: !refs 904 + | _ -> 905 + refs := (doi, doi, External) :: !refs 906 + with _ -> () 907 + ) doi_matches; 908 + 909 + (* Scan body for publisher URLs and resolve from DOI cache *) 910 + let publisher_pattern = Re.Perl.compile_pat "https?://(?:(?:www\\.)?(?:linkinghub\\.elsevier\\.com|(?:www\\.)?sciencedirect\\.com/science/article|ieeexplore\\.ieee\\.org|academic\\.oup\\.com|nature\\.com|journals\\.sagepub\\.com|garfield\\.library\\.upenn\\.edu|link\\.springer\\.com|arxiv\\.org/abs)/[^)\\s\"'>]+|(?:dl\\.acm\\.org|(?:www\\.)?tandfonline\\.com)/doi(?:/pdf)?/10\\.[^)\\s\"'>]+)" in 911 + let publisher_matches = Re.all publisher_pattern body in 912 + List.iter (fun group -> 913 + try 914 + let url = Re.Group.get group 0 in 915 + match Bushel_doi_entry.find_by_url doi_entries url with 916 + | Some doi_entry when doi_entry.status = Resolved -> 917 + let doi = doi_entry.doi in 918 + if not (List.exists (fun (d, _, _) -> d = doi) 
!refs) then 919 + let citation = format_citation 920 + ~authors:doi_entry.authors 921 + ~year:doi_entry.year 922 + ~title:doi_entry.title 923 + ~publisher:(Some doi_entry.publisher) 924 + in 925 + refs := (doi, citation, External) :: !refs 926 + | _ -> () 927 + with _ -> () 928 + ) publisher_matches; 887 929 888 - List.rev !refs 930 + (* Filter out the note's own DOI from references *) 931 + let own_doi = Bushel_note.doi note in 932 + let filtered_refs = List.filter (fun (doi, _, _) -> 933 + match own_doi with 934 + | Some own -> doi <> own 935 + | None -> true 936 + ) !refs in 937 + List.rev filtered_refs
+13 -1
lib_eio/bushel_loader.ml
··· 139 139 | None -> [] 140 140 in 141 141 Log.info (fun m -> m "Loaded %d images" (List.length images)); 142 + let doi_entries = 143 + let doi_path = Filename.concat base "data/doi.yml" in 144 + try 145 + let content = Eio.Path.load Eio.Path.(fs / doi_path) in 146 + let entries = Bushel.Doi_entry.of_yaml_string content in 147 + Log.info (fun m -> m "Loaded %d DOI entries from %s" (List.length entries) doi_path); 148 + entries 149 + with 150 + | Eio.Io (Eio.Fs.E (Eio.Fs.Not_found _), _) -> 151 + Log.info (fun m -> m "No DOI cache found at %s" doi_path); 152 + [] 153 + in 142 154 let data_dir = Filename.concat base "data" in 143 - let entries = Bushel.Entry.v ~papers ~notes ~projects ~ideas ~videos ~contacts ~images ~data_dir () in 155 + let entries = Bushel.Entry.v ~papers ~notes ~projects ~ideas ~videos ~contacts ~images ~doi_entries ~data_dir () in 144 156 Log.info (fun m -> m "Building link graph"); 145 157 let graph = build_link_graph entries in 146 158 Bushel.Link_graph.set_graph graph;