A fork of mtelver's day10 project

tessera-npy: edge case tests, polished docs, opam file

- Add trailing-comma (1D shape) and scalar shape edge case tests
- Comprehensive .mli documentation with usage example
- Auto-generated opam file

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

+155 -14
+64 -14
tessera-npy/lib/npy.mli
··· 1 - (** Read and write numpy .npy files. *) 1 + (** Read numpy [.npy] files. 2 + 3 + A portable OCaml library for parsing the numpy binary format. Data is 4 + returned as flat {!Bigarray.Array1} values in C layout; use {!shape} to 5 + interpret the dimensions. 6 + 7 + {2 Supported dtypes} 8 + 9 + {ul 10 + {- [|i1] — signed 8-bit integer} 11 + {- [|u1] — unsigned 8-bit integer} 12 + {- [<f4] — 32-bit float (little-endian)} 13 + {- [<f8] — 64-bit float (little-endian)}} 14 + 15 + {2 Example} 16 + 17 + {[ 18 + let data = In_channel.with_open_bin "embeddings.npy" In_channel.input_all in 19 + match Npy.of_string data with 20 + | Error msg -> failwith msg 21 + | Ok t -> 22 + let shape = Npy.shape t in 23 + Printf.printf "shape: %s\n" 24 + (String.concat "x" (Array.to_list (Array.map string_of_int shape))); 25 + match Npy.data_float32 t with 26 + | Some ba -> Printf.printf "first value: %f\n" (Bigarray.Array1.get ba 0) 27 + | None -> Printf.printf "not a float32 array\n" 28 + ]} *) 2 29 3 30 (** {1 Types} *) 4 31 ··· 9 36 | Float32 : float dtype 10 37 | Float64 : float dtype 11 38 12 - (** A parsed .npy file: dtype, shape, and raw data as bytes. *) 39 + (** A parsed [.npy] file. *) 13 40 type t 14 41 15 - (** {1 Reading} *) 42 + (** {1 Parsing} *) 16 43 17 44 val of_string : string -> (t, string) result 18 - (** Parse a .npy file from its raw bytes. *) 45 + (** Parse a [.npy] file from its complete contents as a string. Supports 46 + format versions 1.0 and 2.0. Returns [Error msg] if the magic bytes are 47 + wrong, the header is malformed, or the dtype is unsupported. *) 48 + 49 + (** {1 Metadata} *) 19 50 20 51 val shape : t -> int array 21 - (** The shape of the array. *) 52 + (** The shape of the array. For example, a 10x8 matrix returns [\[|10; 8|\]]. 53 + A scalar returns [\[||\]]. *) 22 54 23 55 val fortran_order : t -> bool 24 - (** Whether the data is in Fortran (column-major) order. *) 56 + (** Whether the data is stored in Fortran (column-major) order. 
Most numpy 57 + files use C order ([false]). *) 58 + 59 + (** {1 Data access} 25 60 26 - val data_int8 : t -> (int, Bigarray.int8_signed_elt, Bigarray.c_layout) Bigarray.Array1.t option 27 - (** Access data as int8 bigarray if the dtype matches. *) 61 + Each accessor returns [Some bigarray] if the dtype matches, or [None] 62 + otherwise. The returned {!Bigarray.Array1} is a flat (1-dimensional) view 63 + of the data in row-major order; use {!shape} to interpret the dimensions. 64 + 65 + For multi-dimensional indexing, compute the flat index as: 66 + [row * cols + col] (for 2D) or [i * (d1 * d2) + j * d2 + k] (for 3D). *) 67 + 68 + val data_int8 : 69 + t -> 70 + (int, Bigarray.int8_signed_elt, Bigarray.c_layout) Bigarray.Array1.t option 71 + (** Access data as signed 8-bit integers (numpy dtype [|i1]). *) 28 72 29 - val data_uint8 : t -> (int, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t option 30 - (** Access data as uint8 bigarray if the dtype matches. *) 73 + val data_uint8 : 74 + t -> 75 + (int, Bigarray.int8_unsigned_elt, Bigarray.c_layout) Bigarray.Array1.t option 76 + (** Access data as unsigned 8-bit integers (numpy dtype [|u1]). *) 31 77 32 - val data_float32 : t -> (float, Bigarray.float32_elt, Bigarray.c_layout) Bigarray.Array1.t option 33 - (** Access data as float32 bigarray if the dtype matches. *) 78 + val data_float32 : 79 + t -> 80 + (float, Bigarray.float32_elt, Bigarray.c_layout) Bigarray.Array1.t option 81 + (** Access data as 32-bit floats (numpy dtype [<f4], little-endian). *) 34 82 35 - val data_float64 : t -> (float, Bigarray.float64_elt, Bigarray.c_layout) Bigarray.Array1.t option 36 - (** Access data as float64 bigarray if the dtype matches. *) 83 + val data_float64 : 84 + t -> 85 + (float, Bigarray.float64_elt, Bigarray.c_layout) Bigarray.Array1.t option 86 + (** Access data as 64-bit floats (numpy dtype [<f8], little-endian). *)
+17
tessera-npy/tessera-npy.install
··· 1 + lib: [ 2 + "_build/install/default/lib/tessera-npy/META" 3 + "_build/install/default/lib/tessera-npy/dune-package" 4 + "_build/install/default/lib/tessera-npy/npy.a" 5 + "_build/install/default/lib/tessera-npy/npy.cma" 6 + "_build/install/default/lib/tessera-npy/npy.cmi" 7 + "_build/install/default/lib/tessera-npy/npy.cmt" 8 + "_build/install/default/lib/tessera-npy/npy.cmti" 9 + "_build/install/default/lib/tessera-npy/npy.cmx" 10 + "_build/install/default/lib/tessera-npy/npy.cmxa" 11 + "_build/install/default/lib/tessera-npy/npy.ml" 12 + "_build/install/default/lib/tessera-npy/npy.mli" 13 + "_build/install/default/lib/tessera-npy/opam" 14 + ] 15 + libexec: [ 16 + "_build/install/default/lib/tessera-npy/npy.cmxs" 17 + ]
+26
tessera-npy/tessera-npy.opam
··· 1 + # This file is generated by dune, edit dune-project instead 2 + opam-version: "2.0" 3 + synopsis: "Read numpy .npy files in OCaml" 4 + description: 5 + "A portable OCaml library for parsing the numpy .npy binary format. Supports int8, uint8, float32, and float64 dtypes with arbitrary shapes. Uses Bigarray for zero-copy data representation." 6 + license: "ISC" 7 + depends: [ 8 + "dune" {>= "3.17"} 9 + "ocaml" {>= "5.2"} 10 + "alcotest" {with-test & >= "0.8"} 11 + "odoc" {with-doc} 12 + ] 13 + build: [ 14 + ["dune" "subst"] {dev} 15 + [ 16 + "dune" 17 + "build" 18 + "-p" 19 + name 20 + "-j" 21 + jobs 22 + "@install" 23 + "@runtest" {with-test} 24 + "@doc" {with-doc} 25 + ] 26 + ]
+48
tessera-npy/test/test_npy.ml
··· 88 88 if Float.abs (v -. 3.14) > 0.001 then 89 89 Alcotest.failf "expected ~3.14, got %f" v) 90 90 91 + let make_npy_v1_raw header data = 92 + let prefix_len = 6 + 2 + 2 in 93 + let raw_header_len = String.length header + 1 in 94 + let padded_len = 95 + let total = prefix_len + raw_header_len in 96 + let rem = total mod 64 in 97 + if rem = 0 then raw_header_len else raw_header_len + (64 - rem) 98 + in 99 + let buf = Buffer.create (prefix_len + padded_len + String.length data) in 100 + Buffer.add_string buf "\x93NUMPY\x01\x00"; 101 + Buffer.add_char buf (Char.chr (padded_len land 0xff)); 102 + Buffer.add_char buf (Char.chr ((padded_len lsr 8) land 0xff)); 103 + Buffer.add_string buf header; 104 + for _ = 1 to padded_len - raw_header_len do 105 + Buffer.add_char buf ' ' 106 + done; 107 + Buffer.add_char buf '\n'; 108 + Buffer.add_string buf data; 109 + Buffer.contents buf 110 + 111 + let test_parse_1d_trailing_comma () = 112 + (* numpy writes 1D shapes as "(5,)" with trailing comma *) 113 + let header = "{'descr': '|i1', 'fortran_order': False, 'shape': (5,), }" in 114 + let npy = make_npy_v1_raw header "\x00\x00\x00\x00\x00" in 115 + match Npy.of_string npy with 116 + | Error e -> Alcotest.fail e 117 + | Ok t -> 118 + Alcotest.(check (array int)) "shape" [|5|] (Npy.shape t) 119 + 120 + let test_parse_scalar_shape () = 121 + (* scalar: shape () *) 122 + let header = "{'descr': '<f4', 'fortran_order': False, 'shape': (), }" in 123 + let bits = Int32.bits_of_float 42.0 in 124 + let data = String.init 4 (fun i -> 125 + Char.chr (Int32.to_int (Int32.logand (Int32.shift_right_logical bits (i * 8)) 0xffl)) 126 + ) in 127 + let npy = make_npy_v1_raw header data in 128 + match Npy.of_string npy with 129 + | Error e -> Alcotest.fail e 130 + | Ok t -> 131 + Alcotest.(check (array int)) "shape" [||] (Npy.shape t); 132 + (match Npy.data_float32 t with 133 + | None -> Alcotest.fail "expected float32" 134 + | Some ba -> 135 + Alcotest.(check int) "length" 1 
(Bigarray.Array1.dim ba)) 136 + 91 137 let test_wrong_dtype_returns_none () = 92 138 let data = String.init 6 (fun i -> 93 139 Char.chr ([| 1; 255; 127; 128; 0; 42 |].(i)) ··· 108 154 Alcotest.test_case "float32 header" `Quick test_parse_float32_header; 109 155 Alcotest.test_case "3d shape" `Quick test_parse_3d_shape; 110 156 Alcotest.test_case "bad magic" `Quick test_bad_magic; 157 + Alcotest.test_case "1d trailing comma" `Quick test_parse_1d_trailing_comma; 158 + Alcotest.test_case "scalar shape" `Quick test_parse_scalar_shape; 111 159 ] ); 112 160 ( "data", 113 161 [