···44 | Float32 : float dtype
55 | Float64 : float dtype
6677+type descr = D_int8 | D_uint8 | D_float32 | D_float64
88+79type t = {
810 shape : int array;
911 fortran_order : bool;
1010- descr : string;
1212+ descr : descr;
1113 data : string;
1214}
13151414-let of_string _s = Error "not implemented"
1616+let find_substring haystack needle =
1717+ let nlen = String.length needle in
1818+ let hlen = String.length haystack in
1919+ let rec search i =
2020+ if i + nlen > hlen then None
2121+ else if String.sub haystack i nlen = needle then Some i
2222+ else search (i + 1)
2323+ in
2424+ search 0
2525+2626+let extract_quoted_value header key =
2727+ let pattern = "'" ^ key ^ "': " in
2828+ match find_substring header pattern with
2929+ | None -> Error (Printf.sprintf "missing key: %s" key)
3030+ | Some i ->
3131+ let start = i + String.length pattern in
3232+ if start >= String.length header then Error (Printf.sprintf "truncated value for key: %s" key)
3333+ else
3434+ let c = header.[start] in
3535+ if c = '\'' then
3636+ (* quoted string value *)
3737+ let value_start = start + 1 in
3838+ (match find_substring (String.sub header value_start (String.length header - value_start)) "'" with
3939+ | None -> Error (Printf.sprintf "unterminated string for key: %s" key)
4040+ | Some len -> Ok (String.sub header value_start len))
4141+ else
4242+ (* unquoted value - read until comma or } *)
4343+ let rec find_end j =
4444+ if j >= String.length header then j
4545+ else match header.[j] with
4646+ | ',' | '}' | ')' -> j
4747+ | _ -> find_end (j + 1)
4848+ in
4949+ let end_pos = find_end start in
5050+ let value = String.trim (String.sub header start (end_pos - start)) in
5151+ Ok value
5252+5353+let parse_descr s =
5454+ match s with
5555+ | "|i1" -> Ok D_int8
5656+ | "|u1" -> Ok D_uint8
5757+ | "<f4" -> Ok D_float32
5858+ | "<f8" -> Ok D_float64
5959+ | _ -> Error (Printf.sprintf "unsupported dtype: %s" s)
6060+6161+let parse_fortran_order s =
6262+ match s with
6363+ | "True" -> Ok true
6464+ | "False" -> Ok false
6565+ | _ -> Error (Printf.sprintf "invalid fortran_order: %s" s)
6666+6767+let parse_shape header =
6868+ let pattern = "'shape': (" in
6969+ match find_substring header pattern with
7070+ | None -> Error "missing shape"
7171+ | Some i ->
7272+ let start = i + String.length pattern in
7373+ (match find_substring (String.sub header start (String.length header - start)) ")" with
7474+ | None -> Error "unterminated shape"
7575+ | Some len ->
7676+ let shape_str = String.sub header start len in
7777+ let shape_str = String.trim shape_str in
7878+ if shape_str = "" then Ok [||]
7979+ else
8080+ let parts = String.split_on_char ',' shape_str in
8181+ let parts = List.filter (fun s -> String.trim s <> "") parts in
8282+ let dims = List.map (fun s -> int_of_string (String.trim s)) parts in
8383+ Ok (Array.of_list dims))
8484+8585+let of_string s =
8686+ let len = String.length s in
8787+ if len < 10 then Error "too short for .npy file"
8888+ else if String.sub s 0 6 <> "\x93NUMPY" then Error "bad magic number"
8989+ else
9090+ let major = Char.code s.[6] in
9191+ let _minor = Char.code s.[7] in
9292+ let header_len, header_offset =
9393+ if major = 1 then
9494+ let hl = Char.code s.[8] lor (Char.code s.[9] lsl 8) in
9595+ (hl, 10)
9696+ else if major = 2 then
9797+ if len < 12 then (0, 12)
9898+ else
9999+ let hl =
100100+ Char.code s.[8]
101101+ lor (Char.code s.[9] lsl 8)
102102+ lor (Char.code s.[10] lsl 16)
103103+ lor (Char.code s.[11] lsl 24)
104104+ in
105105+ (hl, 12)
106106+ else (0, 10)
107107+ in
108108+ if header_offset + header_len > len then Error "truncated header"
109109+ else
110110+ let header = String.sub s header_offset header_len in
111111+ match extract_quoted_value header "descr" with
112112+ | Error e -> Error e
113113+ | Ok descr_str ->
114114+ match parse_descr descr_str with
115115+ | Error e -> Error e
116116+ | Ok descr ->
117117+ match extract_quoted_value header "fortran_order" with
118118+ | Error e -> Error e
119119+ | Ok fo_str ->
120120+ match parse_fortran_order fo_str with
121121+ | Error e -> Error e
122122+ | Ok fortran_order ->
123123+ match parse_shape header with
124124+ | Error e -> Error e
125125+ | Ok shape ->
126126+ let data_offset = header_offset + header_len in
127127+ let data = String.sub s data_offset (len - data_offset) in
128128+ Ok { shape; fortran_order; descr; data }
1512916130let shape t = t.shape
1713118132let fortran_order t = t.fortran_order
191332020-let data_int8 _t = None
134134+let data_int8 t =
135135+ match t.descr with
136136+ | D_int8 ->
137137+ let n = String.length t.data in
138138+ let ba = Bigarray.Array1.create Bigarray.int8_signed Bigarray.c_layout n in
139139+ for i = 0 to n - 1 do
140140+ let v = Char.code t.data.[i] in
141141+ let v = if v >= 128 then v - 256 else v in
142142+ Bigarray.Array1.set ba i v
143143+ done;
144144+ Some ba
145145+ | _ -> None
146146+147147+let data_uint8 t =
148148+ match t.descr with
149149+ | D_uint8 ->
150150+ let n = String.length t.data in
151151+ let ba = Bigarray.Array1.create Bigarray.int8_unsigned Bigarray.c_layout n in
152152+ for i = 0 to n - 1 do
153153+ Bigarray.Array1.set ba i (Char.code t.data.[i])
154154+ done;
155155+ Some ba
156156+ | _ -> None
211572222-let data_uint8 _t = None
158158+let read_le_int32 s off =
159159+ let b0 = Char.code s.[off] in
160160+ let b1 = Char.code s.[off + 1] in
161161+ let b2 = Char.code s.[off + 2] in
162162+ let b3 = Char.code s.[off + 3] in
163163+ Int32.logor
164164+ (Int32.of_int b0)
165165+ (Int32.logor
166166+ (Int32.shift_left (Int32.of_int b1) 8)
167167+ (Int32.logor
168168+ (Int32.shift_left (Int32.of_int b2) 16)
169169+ (Int32.shift_left (Int32.of_int b3) 24)))
231702424-let data_float32 _t = None
171171+let read_le_int64 s off =
172172+ let b i = Int64.of_int (Char.code s.[off + i]) in
173173+ let ( lor ) = Int64.logor in
174174+ let ( lsl ) = Int64.shift_left in
175175+ (b 0) lor ((b 1) lsl 8) lor ((b 2) lsl 16) lor ((b 3) lsl 24)
176176+ lor ((b 4) lsl 32) lor ((b 5) lsl 40) lor ((b 6) lsl 48) lor ((b 7) lsl 56)
251772626-let data_float64 _t = None
178178+let data_float32 t =
179179+ match t.descr with
180180+ | D_float32 ->
181181+ let n = String.length t.data / 4 in
182182+ let ba = Bigarray.Array1.create Bigarray.float32 Bigarray.c_layout n in
183183+ for i = 0 to n - 1 do
184184+ let bits = read_le_int32 t.data (i * 4) in
185185+ Bigarray.Array1.set ba i (Int32.float_of_bits bits)
186186+ done;
187187+ Some ba
188188+ | _ -> None
189189+190190+let data_float64 t =
191191+ match t.descr with
192192+ | D_float64 ->
193193+ let n = String.length t.data / 8 in
194194+ let ba = Bigarray.Array1.create Bigarray.float64 Bigarray.c_layout n in
195195+ for i = 0 to n - 1 do
196196+ let bits = read_le_int64 t.data (i * 8) in
197197+ Bigarray.Array1.set ba i (Int64.float_of_bits bits)
198198+ done;
199199+ Some ba
200200+ | _ -> None
+67-1
tessera-npy/test/test_npy.ml
···11-let () = ()
11+let make_npy_v1 ~descr ~fortran_order ~shape data =
22+ let header =
33+ Printf.sprintf "{'descr': '%s', 'fortran_order': %s, 'shape': (%s), }"
44+ descr
55+ (if fortran_order then "True" else "False")
66+ (String.concat ", " (List.map string_of_int shape))
77+ in
88+ let prefix_len = 6 + 2 + 2 in
99+ let raw_header_len = String.length header + 1 in
1010+ let padded_len =
1111+ let total = prefix_len + raw_header_len in
1212+ let rem = total mod 64 in
1313+ if rem = 0 then raw_header_len else raw_header_len + (64 - rem)
1414+ in
1515+ let buf = Buffer.create (prefix_len + padded_len + String.length data) in
1616+ Buffer.add_string buf "\x93NUMPY";
1717+ Buffer.add_char buf '\x01';
1818+ Buffer.add_char buf '\x00';
1919+ Buffer.add_char buf (Char.chr (padded_len land 0xff));
2020+ Buffer.add_char buf (Char.chr ((padded_len lsr 8) land 0xff));
2121+ Buffer.add_string buf header;
2222+ for _ = 1 to padded_len - raw_header_len do
2323+ Buffer.add_char buf ' '
2424+ done;
2525+ Buffer.add_char buf '\n';
2626+ Buffer.add_string buf data;
2727+ Buffer.contents buf
2828+2929+let test_parse_int8_header () =
3030+ let npy = make_npy_v1 ~descr:"|i1" ~fortran_order:false ~shape:[3; 4] "\x00" in
3131+ match Npy.of_string npy with
3232+ | Error e -> Alcotest.fail e
3333+ | Ok t ->
3434+ Alcotest.(check (array int)) "shape" [|3; 4|] (Npy.shape t);
3535+ Alcotest.(check bool) "fortran_order" false (Npy.fortran_order t)
3636+3737+let test_parse_float32_header () =
3838+ let npy = make_npy_v1 ~descr:"<f4" ~fortran_order:false ~shape:[2; 3] "\x00" in
3939+ match Npy.of_string npy with
4040+ | Error e -> Alcotest.fail e
4141+ | Ok t ->
4242+ Alcotest.(check (array int)) "shape" [|2; 3|] (Npy.shape t)
4343+4444+let test_parse_3d_shape () =
4545+ let npy = make_npy_v1 ~descr:"|i1" ~fortran_order:false ~shape:[10; 20; 128] "\x00" in
4646+ match Npy.of_string npy with
4747+ | Error e -> Alcotest.fail e
4848+ | Ok t ->
4949+ Alcotest.(check (array int)) "shape" [|10; 20; 128|] (Npy.shape t)
5050+5151+let test_bad_magic () =
5252+ let npy = "NOT_NPY_DATA" in
5353+ match Npy.of_string npy with
5454+ | Ok _ -> Alcotest.fail "should have failed"
5555+ | Error _ -> ()
5656+5757+let () =
5858+ Alcotest.run "tessera-npy"
5959+ [
6060+ ( "header",
6161+ [
6262+ Alcotest.test_case "int8 header" `Quick test_parse_int8_header;
6363+ Alcotest.test_case "float32 header" `Quick test_parse_float32_header;
6464+ Alcotest.test_case "3d shape" `Quick test_parse_3d_shape;
6565+ Alcotest.test_case "bad magic" `Quick test_bad_magic;
6666+ ] );
6767+ ]