My personal data management layer
1(*---------------------------------------------------------------------------
2 Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
3 SPDX-License-Identifier: ISC
4 ---------------------------------------------------------------------------*)
5
6(** Utility functions for Bushel *)
7
8(** Count words in a string. *)
9let count_words (text : string) : int =
10 let len = String.length text in
11 let rec count_words_helper (index : int) (in_word : bool) (count : int) : int =
12 if index >= len then
13 if in_word then count + 1 else count
14 else
15 let char = String.get text index in
16 let is_whitespace =
17 Char.equal char ' '
18 || Char.equal char '\t'
19 || Char.equal char '\n'
20 || Char.equal char '\r'
21 in
22 if is_whitespace then
23 if in_word then count_words_helper (index + 1) false (count + 1)
24 else count_words_helper (index + 1) false count
25 else count_words_helper (index + 1) true count
26 in
27 count_words_helper 0 false 0
28
29(** Get the first paragraph/hunk from text (up to double newline). *)
30let first_hunk s =
31 let lines = String.split_on_char '\n' s in
32 let rec aux acc = function
33 | [] -> String.concat "\n" (List.rev acc)
34 | "" :: "" :: _ -> String.concat "\n" (List.rev acc)
35 | line :: rest -> aux (line :: acc) rest
36 in
37 aux [] lines
38
39(** Get first and last hunks from text. *)
40let first_and_last_hunks s =
41 let lines = String.split_on_char '\n' s in
42 let rec aux acc = function
43 | [] -> String.concat "\n" (List.rev acc), ""
44 | "" :: "" :: rest ->
45 String.concat "\n" (List.rev acc), String.concat "\n" (List.rev rest)
46 | line :: rest -> aux (line :: acc) rest
47 in
48 aux [] lines
49
50(** Find all footnote definition lines in text. *)
51let find_footnote_lines s =
52 let lines = String.split_on_char '\n' s in
53 let is_footnote_def line =
54 String.length line > 3 &&
55 line.[0] = '[' &&
56 line.[1] = '^' &&
57 String.contains line ':' &&
58 let colon_pos = String.index line ':' in
59 colon_pos > 2 && line.[colon_pos - 1] = ']'
60 in
61 let is_continuation line =
62 String.length line > 0 && (line.[0] = ' ' || line.[0] = '\t')
63 in
64 let rec collect_footnotes acc in_footnote = function
65 | [] -> List.rev acc
66 | line :: rest ->
67 if is_footnote_def line then
68 collect_footnotes (line :: acc) true rest
69 else if in_footnote && is_continuation line then
70 collect_footnotes (line :: acc) true rest
71 else
72 collect_footnotes acc false rest
73 in
74 collect_footnotes [] false lines
75
76(** Augment first hunk with footnote definitions from last hunk. *)
77let first_hunk_with_footnotes s =
78 let first, last = first_and_last_hunks s in
79 let footnote_lines = find_footnote_lines last in
80 if footnote_lines = [] then first
81 else first ^ "\n\n" ^ String.concat "\n" footnote_lines
82
83(** Trim leading/trailing whitespace and normalize multiple blank lines. *)
84let normalize_body s =
85 let trimmed = String.trim s in
86 (* Replace 3+ consecutive newlines with exactly 2 newlines *)
87 let re = Re.compile (Re.seq [Re.char '\n'; Re.char '\n'; Re.rep1 (Re.char '\n')]) in
88 Re.replace_string re ~by:"\n\n" trimmed
89
90(** Extract domain from URL. *)
91let extract_domain url =
92 try
93 let uri = Uri.of_string url in
94 match Uri.host uri with
95 | Some host -> host
96 | None -> "unknown"
97 with _ -> "unknown"
98
99(** Check if a string is a valid URL. *)
100let is_url s =
101 String.starts_with ~prefix:"http://" s || String.starts_with ~prefix:"https://" s