My working unpac repository

utftrip: output U+FFFD on malformed byte sequences.

+16 -9
+16 -9
test/utftrip.ml
··· 84 (match v with `Malformed _ -> input_malformed := true | _ -> ()); 85 (pp_decode inf d) Format.std_formatter v 86 87 - 88 let dump_ inf encoding nln src = 89 let rec loop inf d = match Uutf.decode d with `Await -> assert false 90 | v -> ··· 182 (* Trip *) 183 184 let trip_ inf nln ie oe src dst = 185 - let malformed = log_malformed inf in 186 let rec loop d e = function `Await -> assert false 187 | `Uchar _ as v -> ignore (Uutf.encode e v); loop d e (Uutf.decode d) 188 | `End -> ignore (Uutf.encode e `End) 189 - | `Malformed _ as v -> malformed d v; loop d e (Uutf.decode d) 190 in 191 let d = Uutf.decoder ?nln ?encoding:ie src in 192 let e, first = match oe with ··· 203 loop d e first; close_src src 204 205 let trip_unix inf usize nln ie oe fdi fdo = 206 - let malformed = log_malformed inf in 207 let rec loop fdi fdo ds es d e = function 208 | `Uchar _ as v -> 209 encode_unix fdo es e v; loop fdi fdo ds es d e (Uutf.decode d) 210 | `End -> encode_unix fdo es e `End 211 - | `Malformed _ as v -> malformed d v; loop fdi fdo ds es d e (Uutf.decode d) 212 | `Await -> 213 let rc = unix_read fdi ds 0 (String.length ds) in 214 Uutf.Manual.src d ds 0 rc; loop fdi fdo ds es d e (Uutf.decode d) ··· 331 Arg.(value & pos 0 string "-" & info [] ~doc ~docv:"FILE") 332 333 let cmd = 334 - let doc = "Output the input text as Unicode scalar values, one per line, 335 - in the US-ASCII charset with their position 336 - (see POSITION INFORMATION for more details)." 337 in 338 let ascii = `Ascii, Arg.info ["a"; "ascii"] ~doc in 339 - let doc = "Only guess the encoding." in 340 let guess = `Guess, Arg.info ["g"; "guess"] ~doc in 341 let doc = "Decode only, no encoding." in 342 let dec = `Decode, Arg.info ["decode"] ~doc in ··· 352 to stdout in various ways. If no input encoding is specified, 353 it is guessed. If no output encoding is specified, the input 354 encoding is used."; 355 `S "POSITION INFORMATION"; 356 `P "The format for position information is:"; 357 `P "filename:line.col:(count,byte)";
··· 84 (match v with `Malformed _ -> input_malformed := true | _ -> ()); 85 (pp_decode inf d) Format.std_formatter v 86 87 let dump_ inf encoding nln src = 88 let rec loop inf d = match Uutf.decode d with `Await -> assert false 89 | v -> ··· 181 (* Trip *) 182 183 let trip_ inf nln ie oe src dst = 184 + let malformed d v e = 185 + log_malformed inf d v; ignore (Uutf.encode e (`Uchar Uutf.u_rep)) 186 + in 187 let rec loop d e = function `Await -> assert false 188 | `Uchar _ as v -> ignore (Uutf.encode e v); loop d e (Uutf.decode d) 189 | `End -> ignore (Uutf.encode e `End) 190 + | `Malformed _ as v -> malformed d v e; loop d e (Uutf.decode d) 191 in 192 let d = Uutf.decoder ?nln ?encoding:ie src in 193 let e, first = match oe with ··· 204 loop d e first; close_src src 205 206 let trip_unix inf usize nln ie oe fdi fdo = 207 + let malformed d v e = 208 + log_malformed inf d v; ignore (Uutf.encode e (`Uchar Uutf.u_rep)) 209 + in 210 let rec loop fdi fdo ds es d e = function 211 | `Uchar _ as v -> 212 encode_unix fdo es e v; loop fdi fdo ds es d e (Uutf.decode d) 213 | `End -> encode_unix fdo es e `End 214 + | `Malformed _ as v -> malformed d v e; loop fdi fdo ds es d e (Uutf.decode d) 215 | `Await -> 216 let rc = unix_read fdi ds 0 (String.length ds) in 217 Uutf.Manual.src d ds 0 rc; loop fdi fdo ds es d e (Uutf.decode d) ··· 334 Arg.(value & pos 0 string "-" & info [] ~doc ~docv:"FILE") 335 336 let cmd = 337 + let doc = "Output the input text as Unicode scalar values or malformed 338 + sequences, one per line, in the US-ASCII charset with their 339 + position (see POSITION INFORMATION for more details)." 340 in 341 let ascii = `Ascii, Arg.info ["a"; "ascii"] ~doc in 342 + let doc = "Only guess an UTF encoding. The result of a guess can only be 343 + UTF-8 or UTF-16{LE,BE}." 344 + in 345 let guess = `Guess, Arg.info ["g"; "guess"] ~doc in 346 let doc = "Decode only, no encoding." in 347 let dec = `Decode, Arg.info ["decode"] ~doc in ··· 357 to stdout in various ways. If no input encoding is specified, 358 it is guessed. If no output encoding is specified, the input 359 encoding is used."; 360 + `P "Invalid byte sequences in the input are reported on stderr and 361 + replaced by the Unicode replacement character (U+FFFD) in the output."; 362 `S "POSITION INFORMATION"; 363 `P "The format for position information is:"; 364 `P "filename:line.col:(count,byte)";