OCaml API to the Zotero translation server to get DOI metadata
1(*---------------------------------------------------------------------------
2 Copyright (c) 2019 University of Bern. All rights reserved.
3 Distributed under the ISC license, see terms at the end of the file.
4 ---------------------------------------------------------------------------*)
5
6
7module SM = Map.Make(String)
8
9module Tloc = struct
10 type fpath = string
11 let pp_path = Format.pp_print_string
12
13 type pos = int
14 type line = int
15
16 type t =
17 { file : fpath;
18 sbyte : pos; ebyte : pos;
19 sline : pos * line; eline : pos * line }
20
21 let no_file = "-"
22 let v ~file ~sbyte ~ebyte ~sline ~eline = { file; sbyte; ebyte; sline; eline }
23
24 let pf = Format.fprintf
25
26 let pp_gnu ppf l =
27 if l.ebyte < 0 then pf ppf "%a:" pp_path l.file
28 else
29 let pp_lines ppf l =
30 let col_s = l.sbyte - snd l.sline + 1 in
31 let col_e = l.ebyte - snd l.eline + 1 in
32 if fst l.sline = fst l.eline then
33 pf ppf "%d.%d-%d" (fst l.sline) col_s col_e
34 else
35 pf ppf "%d.%d-%d.%d" (fst l.sline) col_s (fst l.eline) col_e
36 in
37 pf ppf "%a:%a" pp_path l.file pp_lines l
38
39 let pp = pp_gnu
40end
41
42module Utf_8 = struct
43 type case =
44 | L1 | L2 | L3_E0 | L3_E1_EC_or_EE_EF | L3_ED | L4_F0 | L4_F1_F3 | L4_F4 | E
45
46 let case =
47(*
48 (* See https://tools.ietf.org/html/rfc3629#section-4 *)
49 Printf.printf "[|";
50 for i = 0 to 255 do
51 if i mod 16 = 0 then Printf.printf "\n";
52 if 0x00 <= i && i <= 0x7F then Printf.printf "L1; " else
53 if 0xC2 <= i && i <= 0xDF then Printf.printf "L2; " else
54 if 0xE0 = i then Printf.printf "L3_E0; " else
55 if 0xE1 <= i && i <= 0xEC || 0xEE <= i && i <= 0xEF
56 then Printf.printf "L3_E1_EC_or_EE_EF; " else
57 if 0xED = i then Printf.printf "L3_ED;" else
58 if 0xF0 = i then Printf.printf "L4_F0; " else
59 if 0xF1 <= i && i <= 0xF3 then Printf.printf "L4_F1_F3; " else
60 if 0xF4 = i then Printf.printf "L4_F4; " else
61 Printf.printf "E; "
62 done;
63 Printf.printf "\n|]"
64*)
65 [|
66 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1;
67 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1;
68 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1;
69 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1;
70 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1;
71 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1;
72 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1;
73 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1;
74 E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E;
75 E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E;
76 E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E;
77 E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E;
78 E; E; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2;
79 L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2;
80 L3_E0; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF;
81 L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF;
82 L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF;
83 L3_E1_EC_or_EE_EF; L3_ED;L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF;
84 L4_F0; L4_F1_F3; L4_F1_F3; L4_F1_F3; L4_F4; E; E; E; E; E; E; E; E; E; E; E;
85 |]
86end
87
88module Tdec = struct
89 type t =
90 { file : Tloc.fpath; i : string; tok : Buffer.t;
91 mutable pos : int; mutable line : int; mutable line_pos : int; }
92
93 let create ?(file = Tloc.no_file) i =
94 { file; i; tok = Buffer.create 255; pos = 0; line = 1; line_pos = 0 }
95
96 let pos d = d.pos
97 let line d = d.line, d.line_pos
98
99 let loc_to_here d ~sbyte ~sline =
100 Tloc.v ~file:d.file ~sbyte ~ebyte:d.pos ~sline ~eline:(d.line, d.line_pos)
101
102 let loc_here d = loc_to_here d ~sbyte:d.pos ~sline:(d.line, d.line_pos)
103
104 exception Err of Tloc.t * string
105
106 let err loc msg = raise_notrace (Err (loc, msg))
107
108 let err_to_here d ~sbyte ~sline fmt =
109 Format.kasprintf (err (loc_to_here d ~sbyte ~sline)) fmt
110
111 let err_here d fmt = Format.kasprintf (err (loc_here d)) fmt
112
113 let incr_line d =
114 match d.i.[d.pos] with
115 | '\r' -> d.line <- d.line + 1; d.line_pos <- d.pos + 1
116 | '\n' ->
117 (if d.pos = 0 || d.i.[d.pos - 1] <> '\r' then d.line <- d.line + 1);
118 d.line_pos <- d.pos + 1
119 | _ -> ()
120 [@@ocaml.inline]
121
122 let eoi d = d.pos >= String.length d.i [@@ocaml.inline]
123 let byte d = if eoi d then 0xFFFF else Char.code d.i.[d.pos] [@@ocaml.inline]
124 let accept_byte d = incr_line d; d.pos <- d.pos + 1 [@@ocaml.inline]
125
126 let accept_utf_8 accept d =
127 let err d =
128 match byte d with
129 | 0xFFFF -> err_here d "UTF-8 decoding error: unexpected end of input"
130 | b -> err_here d "UTF-8 decoding error: byte %02x illegal here" b
131 in
132 let accept_tail d = if byte d lsr 6 = 0b10 then accept d else err d in
133 match byte d with
134 | 0xFFFF -> err d
135 | b ->
136 match Utf_8.case.(b) with
137 | L1 -> accept d
138 | L2 -> accept d; accept_tail d
139 | L3_E0 ->
140 accept d;
141 if byte d - 0xA0 < 0xBF - 0xA0 then accept d else err d;
142 accept_tail d
143 | L3_E1_EC_or_EE_EF -> accept d; accept_tail d; accept_tail d
144 | L3_ED ->
145 accept d;
146 if byte d - 0x80 < 0x9F - 0x80 then accept d else err d;
147 accept_tail d
148 | L4_F0 ->
149 accept d;
150 if byte d - 0x90 < 0xBF - 0x90 then accept d else err d;
151 accept_tail d; accept_tail d
152 | L4_F1_F3 ->
153 accept d;
154 accept_tail d; accept_tail d; accept_tail d
155 | L4_F4 ->
156 accept d;
157 if byte d - 0x80 < 0x8F - 0x80 then accept d else err d
158 | E -> err d
159
160 let tok_reset d = Buffer.reset d.tok [@@ocaml.inline]
161 let tok_pop d = let t = Buffer.contents d.tok in tok_reset d; t [@@ocaml.inline]
162 let tok_accept_byte d = Buffer.add_char d.tok d.i.[d.pos]; accept_byte d [@@ocaml.inline]
163 let tok_accept_uchar d = accept_utf_8 tok_accept_byte d [@@ocaml.inline]
164end
165
166module Url = struct
167 let string_subrange ?(first = 0) ?last s =
168 let max = String.length s - 1 in
169 let last = match last with
170 | None -> max
171 | Some l when l > max -> max
172 | Some l -> l
173 in
174 let first = if first < 0 then 0 else first in
175 if first > last then "" else String.sub s first (last - first + 1)
176
177 let alpha = function 'A' .. 'Z' | 'a' .. 'z' -> true | _ -> false
178 let digit = function '0' .. '9' -> true | _ -> false
179
180 let scheme_char c =
181 alpha c || digit c || Char.equal c '+' || Char.equal c '-' ||
182 Char.equal '.' c
183
184 let find_scheme_colon u =
185 if u = "" || not (alpha u.[0]) then None
186 else
187 let max = String.length u - 1 in
188 let i = ref 1 in
189 while !i <= max && scheme_char u.[!i] do incr i done;
190 if !i > max || u.[!i] <> ':' then None else Some !i
191
192 let find_authority_last ~start u =
193 let max = String.length u - 1 in
194 if start > max then None
195 else if start + 1 > max then Some (start - 1)
196 else if not (u.[start] = '/' && u.[start + 1] = '/') then Some (start - 1)
197 else
198 let i = ref (start + 2) in
199 while !i <= max && u.[!i] <> '/' && u.[!i] <> '?' && u.[!i] <> '#' do
200 incr i
201 done;
202 Some (!i - 1)
203
204 let scheme u =
205 Option.map (fun i -> String.sub u 0 i) (find_scheme_colon u)
206
207 let path_first u =
208 let start = Option.value ~default:0 (Option.map succ (find_scheme_colon u)) in
209 let first = Option.value ~default:start (Option.map succ (find_authority_last ~start u)) in
210 let max = String.length u - 1 in
211 if first > max || u.[first] = '#' || u.[first] = '?' then None else Some first
212
213 let path_last u ~first =
214 let max = String.length u - 1 in
215 let i = ref (first + 1) in
216 while !i <= max && u.[!i] <> '?' && u.[!i] <> '#' do incr i done;
217 !i - 1
218
219 let path u =
220 Option.map (fun first -> string_subrange ~first ~last:(path_last u ~first) u) (path_first u)
221end
222
223let escape = (* The escape rules are a bit unclear. These are those of LaTeX *)
224 let byte_replaced_length char_len s =
225 let rec loop s max i l = match i > max with
226 | true -> l
227 | false -> loop s max (i + 1) (l + char_len s.[i])
228 in
229 loop s (String.length s - 1) 0 0
230 in
231 let byte_replace set_char s ~len ~replaced_len =
232 let b = Bytes.create replaced_len in
233 let rec loop s max i k = match i > max with
234 | true -> Bytes.unsafe_to_string b
235 | false -> loop s max (i + 1) (set_char b k s.[i])
236 in
237 loop s (len - 1) 0 0
238 in
239 let byte_escaper char_len set_char s =
240 let len = String.length s in
241 let replaced_len = byte_replaced_length char_len s in
242 match replaced_len = len with
243 | true -> s
244 | false -> byte_replace set_char s ~len ~replaced_len
245 in
246 let tilde_esc = "\\textasciitilde" in
247 let tilde_len = String.length tilde_esc in
248 let circ_esc = "\\textasciicircum" in
249 let circ_len = String.length circ_esc in
250 let bslash_esc = "\\textbackslash" in
251 let bslash_len = String.length bslash_esc in
252 let char_len = function
253 | '&' | '%' | '$' | '#' | '_' | '{' | '}' -> 2
254 | '~' -> tilde_len
255 | '^' -> circ_len
256 | '\\' -> bslash_len
257 | _ -> 1
258 in
259 let set_char b i = function
260 | '&' | '%' | '$' | '#' | '_' | '{' | '}' as c ->
261 Bytes.set b i '\\'; Bytes.set b (i + 1) c; i + 2
262 | '~' -> Bytes.blit_string tilde_esc 0 b i tilde_len; i + tilde_len
263 | '^' -> Bytes.blit_string circ_esc 0 b i circ_len; i + circ_len
264 | '\\' -> Bytes.blit_string bslash_esc 0 b i bslash_len; i + bslash_len
265 | c -> Bytes.set b i c; i + 1
266 in
267 byte_escaper char_len set_char
268
269(* TODO unescape on decode. *)
270
271type t =
272 { type' : string;
273 cite_key : string;
274 fields : string SM.t; }
275
276let v ~type' ~cite_key ~fields () = { type'; cite_key; fields }
277
278let type' e = e.type'
279let cite_key e = e.cite_key
280let fields e = e.fields
281
282let pp ppf e =
283 let pp_field ppf (k, v) = Fmt.pf ppf "@[<h>%s = {%s}@]" k (escape v) in
284 Fmt.pf ppf "@[<v2>@%s{%s,@,%a}@]" e.type' e.cite_key
285 (Fmt.iter_bindings ~sep:Fmt.comma SM.iter pp_field) e.fields
286
287(* Field values *)
288
289let list_value s =
290 List.filter (fun s -> s <> "") @@
291 List.map String.trim (String.split_on_char ',' s)
292
293let doi e = match SM.find_opt "doi" e.fields with
294| None -> None
295| Some doi ->
296 let ret doi = match String.trim doi with
297 | "" -> None
298 | doi -> Some doi
299 in
300 (* chop scheme and authority in case there is one *)
301 match Url.scheme doi with
302 | None -> ret doi
303 | Some _ ->
304 match Url.path doi with
305 | None -> ret doi
306 | Some p -> ret p
307
308let keywords e = Option.map list_value (SM.find_opt "keywords" e.fields)
309let annote e = SM.find_opt "annote" e.fields
310
311(* Codec *)
312
313type error_kind = string
314type error = error_kind * Tloc.t
315
316let pp_error ppf (err, l) =
317 Fmt.pf ppf "@[<v>%a:@,%a: %s@]"
318 Tloc.pp l Fmt.string "Error" err
319
320let curr_char d = (* TODO better escaping (this is for error reports) *)
321 Tdec.tok_reset d; Tdec.tok_accept_uchar d; Tdec.tok_pop d
322
323let err_illegal_uchar d = Tdec.err_here d "illegal character: %s" (curr_char d)
324let err_illegal_byte d b = Tdec.err_here d "illegal character U+%04X" b
325let err_expected d exp = Tdec.err_here d "expected %s" exp
326let err_eoi msg d ~sbyte ~sline =
327 Tdec.err_to_here d ~sbyte ~sline "end of input: %s" msg
328
329let err_eoi_entry = err_eoi "unclosed BibTeX entry"
330let err_eoi_field = err_eoi "unfinished BibTeX entry field"
331let err_eoi_value = err_eoi "unfinished BibTeX field value"
332
333let dec_byte d = match Tdec.byte d with
334| c when 0x00 <= c && c <= 0x08 || 0x0E <= c && c <= 0x1F || c = 0x7F ->
335 err_illegal_byte d c
336| c -> c
337[@@ ocaml.inline]
338
339let rec skip_white d = match dec_byte d with
340| 0x20 | 0x09 | 0x0A | 0x0B | 0x0C | 0x0D -> Tdec.accept_byte d; skip_white d
341| _ -> ()
342
343let dec_token ~stop d =
344 let rec loop d = match dec_byte d with
345 | 0x28 | 0x29 | 0x3B | 0x22
346 | 0x20 | 0x09 | 0x0A | 0x0B | 0x0C | 0x0D
347 | 0xFFFF -> Tdec.tok_pop d
348 | c when c = stop -> Tdec.tok_pop d
349 | _ -> Tdec.tok_accept_uchar d; loop d
350 in
351 loop d
352
353let rec dec_string ~sbyte ~sline ~stop d = match dec_byte d with
354| 0xFFFF -> err_eoi_value ~sbyte ~sline d
355| c when c = stop -> Tdec.accept_byte d; Tdec.tok_pop d
356| _ -> Tdec.tok_accept_uchar d; dec_string ~sbyte ~sline ~stop d
357
358let rec dec_tex i ~sbyte ~sline d = match dec_byte d with
359| 0xFFFF -> err_eoi_value ~sbyte ~sline d
360| 0x007D ->
361 if i = 0 then (Tdec.accept_byte d; Tdec.tok_pop d) else
362 (Tdec.tok_accept_uchar d; dec_tex (i - 1) ~sbyte ~sline d)
363| c ->
364 let i = if c = 0x007B then i + 1 else i in
365 Tdec.tok_accept_uchar d; dec_tex i ~sbyte ~sline d
366
367let dec_value d =
368 let sbyte = Tdec.pos d and sline = Tdec.line d in
369 match dec_byte d with
370 | 0x007B (* { *) -> Tdec.accept_byte d; dec_tex 0 ~sbyte ~sline d
371 | 0x0022 -> Tdec.accept_byte d; dec_string ~sbyte ~sline ~stop:0x0022 d
372 | _ -> dec_token ~stop:0x002C d
373
374let dec_field d acc =
375 let sbyte = Tdec.pos d and sline = Tdec.line d in
376 let id = dec_token ~stop:0x003D (* = *) d in
377 skip_white d;
378 match dec_byte d with
379 | 0xFFFF -> err_eoi_field ~sbyte ~sline d
380 | 0x003D (* = *) ->
381 Tdec.accept_byte d;
382 skip_white d;
383 begin match dec_byte d with
384 | 0xFFFF -> err_eoi_field ~sbyte ~sline d
385 | _ ->
386 SM.add (String.lowercase_ascii id) (dec_value d) acc
387 end
388 | _ -> err_expected d "'='"
389
390let rec dec_fields ~sbyte ~sline d acc =
391 skip_white d;
392 match dec_byte d with
393 | 0xFFFF -> err_eoi_entry ~sbyte ~sline d
394 | 0x007D (* } *) -> acc
395 | _ ->
396 let acc = dec_field d acc in
397 skip_white d;
398 match dec_byte d with
399 | 0x002C (* , *) -> Tdec.accept_byte d; dec_fields ~sbyte ~sline d acc
400 | 0x007D (* } *) -> acc
401 | 0xFFFF -> err_eoi_entry ~sbyte ~sline d
402 | _ -> err_expected d "',' or '}'"
403
404let dec_entry d =
405 let sbyte = Tdec.pos d and sline = Tdec.line d in
406 Tdec.accept_byte d (* @ *);
407 let type' = dec_token ~stop:0x007B d (* { *) in
408 match dec_byte d with
409 | 0x007B ->
410 Tdec.accept_byte d;
411 let cite_key = dec_token ~stop:0x002C d (* , *) in
412 skip_white d;
413 begin match dec_byte d with
414 | 0x002C (* , *) ->
415 Tdec.accept_byte d;
416 let fields = dec_fields ~sbyte ~sline d SM.empty in
417 Tdec.accept_byte d;
418 { type'; cite_key; fields }
419 | _ -> err_expected d "','"
420 end
421 | _ -> err_expected d "'{'"
422
423let dec_entries d =
424 let rec loop d acc =
425 skip_white d;
426 match dec_byte d with
427 | 0x0040 (* @ *) -> loop d (dec_entry d :: acc)
428 | 0xFFFF -> List.rev acc
429 | _ -> err_illegal_uchar d
430 in
431 loop d []
432
433let of_string ?(file = Fpath.v "-") s =
434 try
435 let file = Fpath.to_string file in
436 let d = Tdec.create ~file s in
437 Ok (dec_entries d)
438 with Tdec.Err (loc, msg) -> Error (msg, loc)
439
440let of_string' ?file s =
441 Result.map_error (fun e -> Fmt.str "%a" pp_error e) @@
442 (of_string ?file s)
443
444let to_string es = Fmt.str "@[<v>%a@]" (Fmt.list pp) es
445
446(*---------------------------------------------------------------------------
447 Copyright (c) 2019 University of Bern
448
449 Permission to use, copy, modify, and/or distribute this software for any
450 purpose with or without fee is hereby granted, provided that the above
451 copyright notice and this permission notice appear in all copies.
452
453 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
454 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
455 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
456 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
457 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
458 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
459 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
460 ---------------------------------------------------------------------------*)