My working unpac repository

utftrip: output U+FFFD on malformed byte sequences.

+16 -9
+16 -9
test/utftrip.ml
@@ -84,7 +84,6 @@
 (match v with `Malformed _ -> input_malformed := true | _ -> ());
 (pp_decode inf d) Format.std_formatter v
 
-
 let dump_ inf encoding nln src =
 let rec loop inf d = match Uutf.decode d with `Await -> assert false
 | v ->
@@ -182,11 +181,13 @@
 (* Trip *)
 
 let trip_ inf nln ie oe src dst =
-let malformed = log_malformed inf in
+let malformed d v e =
+log_malformed inf d v; ignore (Uutf.encode e (`Uchar Uutf.u_rep))
+in
 let rec loop d e = function `Await -> assert false
 | `Uchar _ as v -> ignore (Uutf.encode e v); loop d e (Uutf.decode d)
 | `End -> ignore (Uutf.encode e `End)
-| `Malformed _ as v -> malformed d v; loop d e (Uutf.decode d)
+| `Malformed _ as v -> malformed d v e; loop d e (Uutf.decode d)
 in
 let d = Uutf.decoder ?nln ?encoding:ie src in
 let e, first = match oe with
@@ -203,12 +204,14 @@
 loop d e first; close_src src
 
 let trip_unix inf usize nln ie oe fdi fdo =
-let malformed = log_malformed inf in
+let malformed d v e =
+log_malformed inf d v; ignore (Uutf.encode e (`Uchar Uutf.u_rep))
+in
 let rec loop fdi fdo ds es d e = function
 | `Uchar _ as v ->
 encode_unix fdo es e v; loop fdi fdo ds es d e (Uutf.decode d)
 | `End -> encode_unix fdo es e `End
-| `Malformed _ as v -> malformed d v; loop fdi fdo ds es d e (Uutf.decode d)
+| `Malformed _ as v -> malformed d v e; loop fdi fdo ds es d e (Uutf.decode d)
 | `Await ->
 let rc = unix_read fdi ds 0 (String.length ds) in
 Uutf.Manual.src d ds 0 rc; loop fdi fdo ds es d e (Uutf.decode d)
@@ -331,12 +334,14 @@
 Arg.(value & pos 0 string "-" & info [] ~doc ~docv:"FILE")
 
 let cmd =
-let doc = "Output the input text as Unicode scalar values, one per line,
-in the US-ASCII charset with their position
-(see POSITION INFORMATION for more details)."
+let doc = "Output the input text as Unicode scalar values or malformed
+sequences, one per line, in the US-ASCII charset with their
+position (see POSITION INFORMATION for more details)."
 in
 let ascii = `Ascii, Arg.info ["a"; "ascii"] ~doc in
-let doc = "Only guess the encoding." in
+let doc = "Only guess an UTF encoding. The result of a guess can only be
+UTF-8 or UTF-16{LE,BE}."
+in
 let guess = `Guess, Arg.info ["g"; "guess"] ~doc in
 let doc = "Decode only, no encoding." in
 let dec = `Decode, Arg.info ["decode"] ~doc in
@@ -352,6 +357,8 @@
 to stdout in various ways. If no input encoding is specified,
 it is guessed. If no output encoding is specified, the input
 encoding is used.";
+`P "Invalid byte sequences in the input are reported on stderr and
+replaced by the Unicode replacement character (U+FFFD) in the output.";
 `S "POSITION INFORMATION";
 `P "The format for position information is:";
 `P "filename:line.col:(count,byte)";