OCaml API to the Zotero translation server to get DOI metadata
at main 460 lines 16 kB view raw
1(*--------------------------------------------------------------------------- 2 Copyright (c) 2019 University of Bern. All rights reserved. 3 Distributed under the ISC license, see terms at the end of the file. 4 ---------------------------------------------------------------------------*) 5 6 7module SM = Map.Make(String) 8 9module Tloc = struct 10 type fpath = string 11 let pp_path = Format.pp_print_string 12 13 type pos = int 14 type line = int 15 16 type t = 17 { file : fpath; 18 sbyte : pos; ebyte : pos; 19 sline : pos * line; eline : pos * line } 20 21 let no_file = "-" 22 let v ~file ~sbyte ~ebyte ~sline ~eline = { file; sbyte; ebyte; sline; eline } 23 24 let pf = Format.fprintf 25 26 let pp_gnu ppf l = 27 if l.ebyte < 0 then pf ppf "%a:" pp_path l.file 28 else 29 let pp_lines ppf l = 30 let col_s = l.sbyte - snd l.sline + 1 in 31 let col_e = l.ebyte - snd l.eline + 1 in 32 if fst l.sline = fst l.eline then 33 pf ppf "%d.%d-%d" (fst l.sline) col_s col_e 34 else 35 pf ppf "%d.%d-%d.%d" (fst l.sline) col_s (fst l.eline) col_e 36 in 37 pf ppf "%a:%a" pp_path l.file pp_lines l 38 39 let pp = pp_gnu 40end 41 42module Utf_8 = struct 43 type case = 44 | L1 | L2 | L3_E0 | L3_E1_EC_or_EE_EF | L3_ED | L4_F0 | L4_F1_F3 | L4_F4 | E 45 46 let case = 47(* 48 (* See https://tools.ietf.org/html/rfc3629#section-4 *) 49 Printf.printf "[|"; 50 for i = 0 to 255 do 51 if i mod 16 = 0 then Printf.printf "\n"; 52 if 0x00 <= i && i <= 0x7F then Printf.printf "L1; " else 53 if 0xC2 <= i && i <= 0xDF then Printf.printf "L2; " else 54 if 0xE0 = i then Printf.printf "L3_E0; " else 55 if 0xE1 <= i && i <= 0xEC || 0xEE <= i && i <= 0xEF 56 then Printf.printf "L3_E1_EC_or_EE_EF; " else 57 if 0xED = i then Printf.printf "L3_ED;" else 58 if 0xF0 = i then Printf.printf "L4_F0; " else 59 if 0xF1 <= i && i <= 0xF3 then Printf.printf "L4_F1_F3; " else 60 if 0xF4 = i then Printf.printf "L4_F4; " else 61 Printf.printf "E; " 62 done; 63 Printf.printf "\n|]" 64*) 65 [| 66 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 67 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 68 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 69 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 70 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 71 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 72 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 73 L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; L1; 74 E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 75 E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 76 E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 77 E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; E; 78 E; E; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; 79 L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; L2; 80 L3_E0; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 81 L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 82 L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 83 L3_E1_EC_or_EE_EF; L3_ED;L3_E1_EC_or_EE_EF; L3_E1_EC_or_EE_EF; 84 L4_F0; L4_F1_F3; L4_F1_F3; L4_F1_F3; L4_F4; E; E; E; E; E; E; E; E; E; E; E; 85 |] 86end 87 88module Tdec = struct 89 type t = 90 { file : Tloc.fpath; i : string; tok : Buffer.t; 91 mutable pos : int; mutable line : int; mutable line_pos : int; } 92 93 let create ?(file = Tloc.no_file) i = 94 { file; i; tok = Buffer.create 255; pos = 0; line = 1; line_pos = 0 } 95 96 let pos d = d.pos 97 let line d = d.line, d.line_pos 98 99 let loc_to_here d ~sbyte ~sline = 100 Tloc.v ~file:d.file ~sbyte ~ebyte:d.pos ~sline ~eline:(d.line, d.line_pos) 101 102 let loc_here d = loc_to_here d ~sbyte:d.pos ~sline:(d.line, d.line_pos) 103 104 exception Err of Tloc.t * string 105 106 let err loc msg = raise_notrace (Err (loc, msg)) 107 108 let err_to_here d ~sbyte ~sline fmt = 109 Format.kasprintf (err (loc_to_here d ~sbyte ~sline)) fmt 110 111 let err_here d fmt = Format.kasprintf (err (loc_here d)) fmt 112 113 let incr_line d = 114 match d.i.[d.pos] with 115 | '\r' -> d.line <- d.line + 1; d.line_pos <- d.pos + 1 116 | '\n' -> 117 (if d.pos = 0 || d.i.[d.pos - 1] <> '\r' then d.line <- d.line + 1); 118 d.line_pos <- d.pos + 1 119 | _ -> () 120 [@@ocaml.inline] 121 122 let eoi d = d.pos >= String.length d.i [@@ocaml.inline] 123 let byte d = if eoi d then 0xFFFF else Char.code d.i.[d.pos] [@@ocaml.inline] 124 let accept_byte d = incr_line d; d.pos <- d.pos + 1 [@@ocaml.inline] 125 126 let accept_utf_8 accept d = 127 let err d = 128 match byte d with 129 | 0xFFFF -> err_here d "UTF-8 decoding error: unexpected end of input" 130 | b -> err_here d "UTF-8 decoding error: byte %02x illegal here" b 131 in 132 let accept_tail d = if byte d lsr 6 = 0b10 then accept d else err d in 133 match byte d with 134 | 0xFFFF -> err d 135 | b -> 136 match Utf_8.case.(b) with 137 | L1 -> accept d 138 | L2 -> accept d; accept_tail d 139 | L3_E0 -> 140 accept d; 141 if byte d - 0xA0 < 0xBF - 0xA0 then accept d else err d; 142 accept_tail d 143 | L3_E1_EC_or_EE_EF -> accept d; accept_tail d; accept_tail d 144 | L3_ED -> 145 accept d; 146 if byte d - 0x80 < 0x9F - 0x80 then accept d else err d; 147 accept_tail d 148 | L4_F0 -> 149 accept d; 150 if byte d - 0x90 < 0xBF - 0x90 then accept d else err d; 151 accept_tail d; accept_tail d 152 | L4_F1_F3 -> 153 accept d; 154 accept_tail d; accept_tail d; accept_tail d 155 | L4_F4 -> 156 accept d; 157 if byte d - 0x80 < 0x8F - 0x80 then accept d else err d 158 | E -> err d 159 160 let tok_reset d = Buffer.reset d.tok [@@ocaml.inline] 161 let tok_pop d = let t = Buffer.contents d.tok in tok_reset d; t [@@ocaml.inline] 162 let tok_accept_byte d = Buffer.add_char d.tok d.i.[d.pos]; accept_byte d [@@ocaml.inline] 163 let tok_accept_uchar d = accept_utf_8 tok_accept_byte d [@@ocaml.inline] 164end 165 166module Url = struct 167 let string_subrange ?(first = 0) ?last s = 168 let max = String.length s - 1 in 169 let last = match last with 170 | None -> max 171 | Some l when l > max -> max 172 | Some l -> l 173 in 174 let first = if first < 0 then 0 else first in 175 if first > last then "" else String.sub s first (last - first + 1) 176 177 let alpha = function 'A' .. 'Z' | 'a' .. 'z' -> true | _ -> false 178 let digit = function '0' .. '9' -> true | _ -> false 179 180 let scheme_char c = 181 alpha c || digit c || Char.equal c '+' || Char.equal c '-' || 182 Char.equal '.' c 183 184 let find_scheme_colon u = 185 if u = "" || not (alpha u.[0]) then None 186 else 187 let max = String.length u - 1 in 188 let i = ref 1 in 189 while !i <= max && scheme_char u.[!i] do incr i done; 190 if !i > max || u.[!i] <> ':' then None else Some !i 191 192 let find_authority_last ~start u = 193 let max = String.length u - 1 in 194 if start > max then None 195 else if start + 1 > max then Some (start - 1) 196 else if not (u.[start] = '/' && u.[start + 1] = '/') then Some (start - 1) 197 else 198 let i = ref (start + 2) in 199 while !i <= max && u.[!i] <> '/' && u.[!i] <> '?' && u.[!i] <> '#' do 200 incr i 201 done; 202 Some (!i - 1) 203 204 let scheme u = 205 Option.map (fun i -> String.sub u 0 i) (find_scheme_colon u) 206 207 let path_first u = 208 let start = Option.value ~default:0 (Option.map succ (find_scheme_colon u)) in 209 let first = Option.value ~default:start (Option.map succ (find_authority_last ~start u)) in 210 let max = String.length u - 1 in 211 if first > max || u.[first] = '#' || u.[first] = '?' then None else Some first 212 213 let path_last u ~first = 214 let max = String.length u - 1 in 215 let i = ref (first + 1) in 216 while !i <= max && u.[!i] <> '?' && u.[!i] <> '#' do incr i done; 217 !i - 1 218 219 let path u = 220 Option.map (fun first -> string_subrange ~first ~last:(path_last u ~first) u) (path_first u) 221end 222 223let escape = (* The escape rules are a bit unclear. These are those of LaTeX *) 224 let byte_replaced_length char_len s = 225 let rec loop s max i l = match i > max with 226 | true -> l 227 | false -> loop s max (i + 1) (l + char_len s.[i]) 228 in 229 loop s (String.length s - 1) 0 0 230 in 231 let byte_replace set_char s ~len ~replaced_len = 232 let b = Bytes.create replaced_len in 233 let rec loop s max i k = match i > max with 234 | true -> Bytes.unsafe_to_string b 235 | false -> loop s max (i + 1) (set_char b k s.[i]) 236 in 237 loop s (len - 1) 0 0 238 in 239 let byte_escaper char_len set_char s = 240 let len = String.length s in 241 let replaced_len = byte_replaced_length char_len s in 242 match replaced_len = len with 243 | true -> s 244 | false -> byte_replace set_char s ~len ~replaced_len 245 in 246 let tilde_esc = "\\textasciitilde" in 247 let tilde_len = String.length tilde_esc in 248 let circ_esc = "\\textasciicircum" in 249 let circ_len = String.length circ_esc in 250 let bslash_esc = "\\textbackslash" in 251 let bslash_len = String.length bslash_esc in 252 let char_len = function 253 | '&' | '%' | '$' | '#' | '_' | '{' | '}' -> 2 254 | '~' -> tilde_len 255 | '^' -> circ_len 256 | '\\' -> bslash_len 257 | _ -> 1 258 in 259 let set_char b i = function 260 | '&' | '%' | '$' | '#' | '_' | '{' | '}' as c -> 261 Bytes.set b i '\\'; Bytes.set b (i + 1) c; i + 2 262 | '~' -> Bytes.blit_string tilde_esc 0 b i tilde_len; i + tilde_len 263 | '^' -> Bytes.blit_string circ_esc 0 b i circ_len; i + circ_len 264 | '\\' -> Bytes.blit_string bslash_esc 0 b i bslash_len; i + bslash_len 265 | c -> Bytes.set b i c; i + 1 266 in 267 byte_escaper char_len set_char 268 269(* TODO unescape on decode. *) 270 271type t = 272 { type' : string; 273 cite_key : string; 274 fields : string SM.t; } 275 276let v ~type' ~cite_key ~fields () = { type'; cite_key; fields } 277 278let type' e = e.type' 279let cite_key e = e.cite_key 280let fields e = e.fields 281 282let pp ppf e = 283 let pp_field ppf (k, v) = Fmt.pf ppf "@[<h>%s = {%s}@]" k (escape v) in 284 Fmt.pf ppf "@[<v2>@%s{%s,@,%a}@]" e.type' e.cite_key 285 (Fmt.iter_bindings ~sep:Fmt.comma SM.iter pp_field) e.fields 286 287(* Field values *) 288 289let list_value s = 290 List.filter (fun s -> s <> "") @@ 291 List.map String.trim (String.split_on_char ',' s) 292 293let doi e = match SM.find_opt "doi" e.fields with 294| None -> None 295| Some doi -> 296 let ret doi = match String.trim doi with 297 | "" -> None 298 | doi -> Some doi 299 in 300 (* chop scheme and authority in case there is one *) 301 match Url.scheme doi with 302 | None -> ret doi 303 | Some _ -> 304 match Url.path doi with 305 | None -> ret doi 306 | Some p -> ret p 307 308let keywords e = Option.map list_value (SM.find_opt "keywords" e.fields) 309let annote e = SM.find_opt "annote" e.fields 310 311(* Codec *) 312 313type error_kind = string 314type error = error_kind * Tloc.t 315 316let pp_error ppf (err, l) = 317 Fmt.pf ppf "@[<v>%a:@,%a: %s@]" 318 Tloc.pp l Fmt.string "Error" err 319 320let curr_char d = (* TODO better escaping (this is for error reports) *) 321 Tdec.tok_reset d; Tdec.tok_accept_uchar d; Tdec.tok_pop d 322 323let err_illegal_uchar d = Tdec.err_here d "illegal character: %s" (curr_char d) 324let err_illegal_byte d b = Tdec.err_here d "illegal character U+%04X" b 325let err_expected d exp = Tdec.err_here d "expected %s" exp 326let err_eoi msg d ~sbyte ~sline = 327 Tdec.err_to_here d ~sbyte ~sline "end of input: %s" msg 328 329let err_eoi_entry = err_eoi "unclosed BibTeX entry" 330let err_eoi_field = err_eoi "unfinished BibTeX entry field" 331let err_eoi_value = err_eoi "unfinished BibTeX field value" 332 333let dec_byte d = match Tdec.byte d with 334| c when 0x00 <= c && c <= 0x08 || 0x0E <= c && c <= 0x1F || c = 0x7F -> 335 err_illegal_byte d c 336| c -> c 337[@@ ocaml.inline] 338 339let rec skip_white d = match dec_byte d with 340| 0x20 | 0x09 | 0x0A | 0x0B | 0x0C | 0x0D -> Tdec.accept_byte d; skip_white d 341| _ -> () 342 343let dec_token ~stop d = 344 let rec loop d = match dec_byte d with 345 | 0x28 | 0x29 | 0x3B | 0x22 346 | 0x20 | 0x09 | 0x0A | 0x0B | 0x0C | 0x0D 347 | 0xFFFF -> Tdec.tok_pop d 348 | c when c = stop -> Tdec.tok_pop d 349 | _ -> Tdec.tok_accept_uchar d; loop d 350 in 351 loop d 352 353let rec dec_string ~sbyte ~sline ~stop d = match dec_byte d with 354| 0xFFFF -> err_eoi_value ~sbyte ~sline d 355| c when c = stop -> Tdec.accept_byte d; Tdec.tok_pop d 356| _ -> Tdec.tok_accept_uchar d; dec_string ~sbyte ~sline ~stop d 357 358let rec dec_tex i ~sbyte ~sline d = match dec_byte d with 359| 0xFFFF -> err_eoi_value ~sbyte ~sline d 360| 0x007D -> 361 if i = 0 then (Tdec.accept_byte d; Tdec.tok_pop d) else 362 (Tdec.tok_accept_uchar d; dec_tex (i - 1) ~sbyte ~sline d) 363| c -> 364 let i = if c = 0x007B then i + 1 else i in 365 Tdec.tok_accept_uchar d; dec_tex i ~sbyte ~sline d 366 367let dec_value d = 368 let sbyte = Tdec.pos d and sline = Tdec.line d in 369 match dec_byte d with 370 | 0x007B (* { *) -> Tdec.accept_byte d; dec_tex 0 ~sbyte ~sline d 371 | 0x0022 -> Tdec.accept_byte d; dec_string ~sbyte ~sline ~stop:0x0022 d 372 | _ -> dec_token ~stop:0x002C d 373 374let dec_field d acc = 375 let sbyte = Tdec.pos d and sline = Tdec.line d in 376 let id = dec_token ~stop:0x003D (* = *) d in 377 skip_white d; 378 match dec_byte d with 379 | 0xFFFF -> err_eoi_field ~sbyte ~sline d 380 | 0x003D (* = *) -> 381 Tdec.accept_byte d; 382 skip_white d; 383 begin match dec_byte d with 384 | 0xFFFF -> err_eoi_field ~sbyte ~sline d 385 | _ -> 386 SM.add (String.lowercase_ascii id) (dec_value d) acc 387 end 388 | _ -> err_expected d "'='" 389 390let rec dec_fields ~sbyte ~sline d acc = 391 skip_white d; 392 match dec_byte d with 393 | 0xFFFF -> err_eoi_entry ~sbyte ~sline d 394 | 0x007D (* } *) -> acc 395 | _ -> 396 let acc = dec_field d acc in 397 skip_white d; 398 match dec_byte d with 399 | 0x002C (* , *) -> Tdec.accept_byte d; dec_fields ~sbyte ~sline d acc 400 | 0x007D (* } *) -> acc 401 | 0xFFFF -> err_eoi_entry ~sbyte ~sline d 402 | _ -> err_expected d "',' or '}'" 403 404let dec_entry d = 405 let sbyte = Tdec.pos d and sline = Tdec.line d in 406 Tdec.accept_byte d (* @ *); 407 let type' = dec_token ~stop:0x007B d (* { *) in 408 match dec_byte d with 409 | 0x007B -> 410 Tdec.accept_byte d; 411 let cite_key = dec_token ~stop:0x002C d (* , *) in 412 skip_white d; 413 begin match dec_byte d with 414 | 0x002C (* , *) -> 415 Tdec.accept_byte d; 416 let fields = dec_fields ~sbyte ~sline d SM.empty in 417 Tdec.accept_byte d; 418 { type'; cite_key; fields } 419 | _ -> err_expected d "','" 420 end 421 | _ -> err_expected d "'{'" 422 423let dec_entries d = 424 let rec loop d acc = 425 skip_white d; 426 match dec_byte d with 427 | 0x0040 (* @ *) -> loop d (dec_entry d :: acc) 428 | 0xFFFF -> List.rev acc 429 | _ -> err_illegal_uchar d 430 in 431 loop d [] 432 433let of_string ?(file = Fpath.v "-") s = 434 try 435 let file = Fpath.to_string file in 436 let d = Tdec.create ~file s in 437 Ok (dec_entries d) 438 with Tdec.Err (loc, msg) -> Error (msg, loc) 439 440let of_string' ?file s = 441 Result.map_error (fun e -> Fmt.str "%a" pp_error e) @@ 442 (of_string ?file s) 443 444let to_string es = Fmt.str "@[<v>%a@]" (Fmt.list pp) es 445 446(*--------------------------------------------------------------------------- 447 Copyright (c) 2019 University of Bern 448 449 Permission to use, copy, modify, and/or distribute this software for any 450 purpose with or without fee is hereby granted, provided that the above 451 copyright notice and this permission notice appear in all copies. 452 453 THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 454 WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 455 MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 456 ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 457 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 458 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 459 OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 460 ---------------------------------------------------------------------------*)