(** Zstandard compression implementation.

    Implements LZ77 matching, block compression, and frame encoding. *)

(** Compression level affects speed vs ratio tradeoff *)
type compression_level = {
  window_log : int;         (* Log2 of window size *)
  chain_log : int;          (* Log2 of hash chain length *)
  hash_log : int;           (* Log2 of hash table size *)
  search_log : int;         (* Number of searches per position *)
  min_match : int;          (* Minimum match length *)
  target_len : int;         (* Target match length *)
  strategy : int;           (* 0=fast, 1=greedy, 2=lazy *)
}

(** Default levels 1-19 *)
let level_params = [|
  (* Level 0/1: Fast *)
  { window_log = 17; chain_log = 12; hash_log = 11; search_log = 1; min_match = 4; target_len = 0; strategy = 0 };
  { window_log = 17; chain_log = 12; hash_log = 11; search_log = 1; min_match = 4; target_len = 0; strategy = 0 };
  (* Level 2 *)
  { window_log = 18; chain_log = 13; hash_log = 12; search_log = 1; min_match = 5; target_len = 4; strategy = 0 };
  (* Level 3 *)
  { window_log = 18; chain_log = 14; hash_log = 13; search_log = 1; min_match = 5; target_len = 8; strategy = 1 };
  (* Level 4 *)
  { window_log = 18; chain_log = 14; hash_log = 14; search_log = 2; min_match = 4; target_len = 8; strategy = 1 };
  (* Level 5 *)
  { window_log = 18; chain_log = 15; hash_log = 14; search_log = 3; min_match = 4; target_len = 16; strategy = 1 };
  (* Level 6 *)
  { window_log = 19; chain_log = 16; hash_log = 15; search_log = 3; min_match = 4; target_len = 32; strategy = 1 };
  (* Level 7 *)
  { window_log = 19; chain_log = 16; hash_log = 15; search_log = 4; min_match = 4; target_len = 32; strategy = 2 };
  (* Level 8 *)
  { window_log = 19; chain_log = 17; hash_log = 16; search_log = 4; min_match = 4; target_len = 64; strategy = 2 };
  (* Level 9 *)
  { window_log = 20; chain_log = 17; hash_log = 16; search_log = 5; min_match = 4; target_len = 64; strategy = 2 };
  (* Level 10 *)
  { window_log = 20; chain_log = 17; hash_log = 16; search_log = 6; min_match = 4; target_len = 128; strategy = 2 };
  (* Level 11 *)
  { window_log = 20; chain_log = 18; hash_log = 17; search_log = 6; min_match = 4; target_len = 128; strategy = 2 };
  (* Level 12 *)
  { window_log = 21; chain_log = 18; hash_log = 17; search_log = 7; min_match = 4; target_len = 256; strategy = 2 };
  (* Level 13 *)
  { window_log = 21; chain_log = 19; hash_log = 18; search_log = 7; min_match = 4; target_len = 256; strategy = 2 };
  (* Level 14 *)
  { window_log = 22; chain_log = 19; hash_log = 18; search_log = 8; min_match = 4; target_len = 256; strategy = 2 };
  (* Level 15 *)
  { window_log = 22; chain_log = 20; hash_log = 18; search_log = 9; min_match = 4; target_len = 256; strategy = 2 };
  (* Level 16 *)
  { window_log = 22; chain_log = 20; hash_log = 19; search_log = 10; min_match = 4; target_len = 512; strategy = 2 };
  (* Level 17 *)
  { window_log = 22; chain_log = 21; hash_log = 19; search_log = 11; min_match = 4; target_len = 512; strategy = 2 };
  (* Level 18 *)
  { window_log = 22; chain_log = 21; hash_log = 20; search_log = 12; min_match = 4; target_len = 512; strategy = 2 };
  (* Level 19 *)
  { window_log = 23; chain_log = 22; hash_log = 20; search_log = 12; min_match = 4; target_len = 1024; strategy = 2 };
|]

let get_level_params level =
  let level = max 1 (min level 19) in
  level_params.(level)

(** A sequence represents a literal run + match *)
type sequence = {
  lit_length : int;
  match_offset : int;
  match_length : int;
}

(** Hash table for fast match finding *)
type hash_table = {
  table : int array;     (* Position indexed by hash *)
  chain : int array;     (* Chain of previous matches at same hash *)
  mask : int;
}

let create_hash_table log_size =
  let size = 1 lsl log_size in
  {
    table = Array.make size (-1);
    chain = Array.make (1 lsl 20) (-1);  (* Max input size *)
    mask = size - 1;
  }

(** Compute hash of 4 bytes *)
let[@inline] hash4 src pos =
  let v = Bytes.get_int32_le src pos in
  (* MurmurHash3-like mixing *)
  let h = Int32.to_int (Int32.mul v 0xcc9e2d51l) in
  (h lxor (h lsr 15))

(** Check if positions match and return length *)
let match_length src pos1 pos2 limit =
  let len = ref 0 in
  let max_len = min (limit - pos1) (pos1 - pos2) in
  while !len < max_len &&
        Bytes.get_uint8 src (pos1 + !len) = Bytes.get_uint8 src (pos2 + !len) do
    incr len
  done;
  !len

(** Find best match at current position *)
let find_best_match ht src pos limit params =
  if pos + 4 > limit then
    (0, 0)
  else begin
    let h = hash4 src pos land ht.mask in
    let prev_pos = ht.table.(h) in

    (* Update hash table *)
    ht.chain.(pos) <- prev_pos;
    ht.table.(h) <- pos;

    if prev_pos < 0 || pos - prev_pos > (1 lsl params.window_log) then
      (0, 0)
    else begin
      (* Search chain for best match *)
      let best_offset = ref 0 in
      let best_length = ref 0 in
      let chain_pos = ref prev_pos in
      let searches = ref 0 in
      let max_searches = 1 lsl params.search_log in

      while !chain_pos >= 0 && !searches < max_searches do
        let offset = pos - !chain_pos in
        if offset > (1 lsl params.window_log) then
          chain_pos := -1
        else begin
          let len = match_length src pos !chain_pos limit in
          if len >= params.min_match && len > !best_length then begin
            best_length := len;
            best_offset := offset
          end;
          chain_pos := ht.chain.(!chain_pos);
          incr searches
        end
      done;

      (!best_offset, !best_length)
    end
  end

(** Parse input into sequences using greedy/lazy matching *)
let parse_sequences src ~pos ~len params =
  let sequences = ref [] in
  let cur_pos = ref pos in
  let limit = pos + len in
  let lit_start = ref pos in

  let ht = create_hash_table params.hash_log in

  while !cur_pos + 4 <= limit do
    let (offset, length) = find_best_match ht src !cur_pos limit params in

    if length >= params.min_match then begin
      (* Emit sequence *)
      let lit_len = !cur_pos - !lit_start in
      sequences := { lit_length = lit_len; match_offset = offset; match_length = length } :: !sequences;

      (* Update hash table for matched positions *)
      for i = !cur_pos + 1 to !cur_pos + length - 1 do
        if i + 4 <= limit then begin
          let h = hash4 src i land ht.mask in
          ht.chain.(i) <- ht.table.(h);
          ht.table.(h) <- i
        end
      done;

      cur_pos := !cur_pos + length;
      lit_start := !cur_pos
    end else begin
      incr cur_pos
    end
  done;

  (* Handle remaining literals *)
  let remaining = limit - !lit_start in
  if remaining > 0 || !sequences = [] then
    sequences := { lit_length = remaining; match_offset = 0; match_length = 0 } :: !sequences;

  List.rev !sequences

(** Encode literal length code *)
let encode_lit_length_code lit_len =
  if lit_len < 16 then
    (lit_len, 0, 0)
  else if lit_len < 64 then
    (16 + (lit_len - 16) / 4, (lit_len - 16) mod 4, 2)
  else if lit_len < 128 then
    (28 + (lit_len - 64) / 8, (lit_len - 64) mod 8, 3)
  else begin
    (* Use baseline tables for larger values *)
    let rec find_code code =
      if code >= 35 then (35, lit_len - Constants.ll_baselines.(35), Constants.ll_extra_bits.(35))
      else if lit_len < Constants.ll_baselines.(code + 1) then
        (code, lit_len - Constants.ll_baselines.(code), Constants.ll_extra_bits.(code))
      else find_code (code + 1)
    in
    find_code 16
  end

(** Minimum match length for zstd *)
let min_match = 3

(** Encode match length code *)
let encode_match_length_code match_len =
  let ml = match_len - min_match in
  if ml < 32 then
    (ml, 0, 0)
  else if ml < 64 then
    (32 + (ml - 32) / 2, (ml - 32) mod 2, 1)
  else begin
    let rec find_code code =
      if code >= 52 then (52, ml - Constants.ml_baselines.(52) + 3, Constants.ml_extra_bits.(52))
      else if ml < Constants.ml_baselines.(code + 1) - 3 then
        (code, ml - Constants.ml_baselines.(code) + 3, Constants.ml_extra_bits.(code))
      else find_code (code + 1)
    in
    find_code 32
  end

(** Encode offset code.
    Returns (of_code, extra_value, extra_bits).

    Repeat offsets use offBase 1,2,3:
    - offBase=1: ofCode=0, no extra bits
    - offBase=2: ofCode=1, extra=0 (1 bit)
    - offBase=3: ofCode=1, extra=1 (1 bit)

    Real offsets use offBase = offset + 3:
    - ofCode = highbit(offBase)
    - extra = lower ofCode bits of offBase *)
let encode_offset_code offset offset_history =
  let off_base =
    if offset = offset_history.(0) then 1
    else if offset = offset_history.(1) then 2
    else if offset = offset_history.(2) then 3
    else offset + 3
  in
  let of_code = Fse.highest_set_bit off_base in
  let extra = off_base land ((1 lsl of_code) - 1) in
  (of_code, extra, of_code)

(** Write raw literals section *)
let write_raw_literals literals ~pos ~len output ~out_pos =
  if len = 0 then begin
    (* Empty literals: single-byte header with type=0, size=0 *)
    Bytes.set_uint8 output out_pos 0;
    1
  end else if len < 32 then begin
    (* Raw literals, single stream, 1-byte header *)
    (* Header: type=0 (raw), size_format=0 (5-bit), regen_size in bits 3-7 *)
    let header = 0b00 lor ((len land 0x1f) lsl 3) in
    Bytes.set_uint8 output out_pos header;
    Bytes.blit literals pos output (out_pos + 1) len;
    1 + len
  end else if len < 4096 then begin
    (* Raw literals, 2-byte header *)
    (* type=0 (bits 0-1), size_format=1 (bits 2-3), size in bits 4-15 *)
    let header = 0b0100 lor ((len land 0x0fff) lsl 4) in
    Bytes.set_uint16_le output out_pos header;
    Bytes.blit literals pos output (out_pos + 2) len;
    2 + len
  end else begin
    (* Raw literals, 3-byte header *)
    (* type=0 (bits 0-1), size_format=2 (bits 2-3), size in bits 4-17 (14 bits) *)
    let header = 0b1000 lor ((len land 0x3fff) lsl 4) in
    Bytes.set_uint8 output out_pos (header land 0xff);
    Bytes.set_uint8 output (out_pos + 1) ((header lsr 8) land 0xff);
    Bytes.set_uint8 output (out_pos + 2) ((header lsr 16) land 0xff);
    Bytes.blit literals pos output (out_pos + 3) len;
    3 + len
  end

(** Write compressed literals with Huffman encoding *)
let write_compressed_literals literals ~pos ~len output ~out_pos =
  if len < 32 then
    (* Too small for Huffman, use raw *)
    write_raw_literals literals ~pos ~len output ~out_pos
  else begin
    (* Count symbol frequencies *)
    let counts = Array.make 256 0 in
    for i = pos to pos + len - 1 do
      let c = Bytes.get_uint8 literals i in
      counts.(c) <- counts.(c) + 1
    done;

    (* Find max symbol used *)
    let max_symbol = ref 0 in
    for i = 0 to 255 do
      if counts.(i) > 0 then max_symbol := i
    done;

    (* Build Huffman table *)
    let ctable = Huffman.build_ctable counts !max_symbol Constants.max_huffman_bits in

    if ctable.num_symbols = 0 then
      write_raw_literals literals ~pos ~len output ~out_pos
    else begin
      (* Decide single vs 4-stream based on size *)
      let use_4streams = len >= 256 in

      (* Write Huffman table header to temp buffer *)
      let header_buf = Bytes.create 256 in
      let header_stream = Bit_writer.Forward.of_bytes header_buf in
      let _num_written = Huffman.write_header header_stream ctable in
      let header_size = Bit_writer.Forward.byte_position header_stream in

      (* Compress literals *)
      let compressed =
        if use_4streams then
          Huffman.compress_4stream ctable literals ~pos ~len
        else
          Huffman.compress_1stream ctable literals ~pos ~len
      in
      let compressed_size = Bytes.length compressed in

      (* Check if compression is worthwhile (should save at least 10%) *)
      let total_compressed_size = header_size + compressed_size in
      if total_compressed_size >= len - len / 10 then
        write_raw_literals literals ~pos ~len output ~out_pos
      else begin
        (* Write compressed literals header *)
        (* Type: 2 = compressed, size_format based on sizes *)
        let regen_size = len in
        let lit_type = 2 in  (* Compressed_literals *)

        let header_pos = ref out_pos in
        if regen_size < 1024 && total_compressed_size < 1024 then begin
          (* 3-byte header: type(2) + size_format(2) + regen(10) + compressed(10) + streams(2) *)
          let size_format = 0 in
          let streams_flag = if use_4streams then 3 else 0 in
          let h0 = lit_type lor (size_format lsl 2) lor (streams_flag lsl 4) lor ((regen_size land 0x3f) lsl 6) in
          let h1 = ((regen_size lsr 6) land 0xf) lor ((total_compressed_size land 0xf) lsl 4) in
          let h2 = (total_compressed_size lsr 4) land 0xff in
          Bytes.set_uint8 output !header_pos h0;
          Bytes.set_uint8 output (!header_pos + 1) h1;
          Bytes.set_uint8 output (!header_pos + 2) h2;
          header_pos := !header_pos + 3
        end else begin
          (* 5-byte header for larger sizes *)
          let size_format = 1 in
          let streams_flag = if use_4streams then 3 else 0 in
          let h0 = lit_type lor (size_format lsl 2) lor (streams_flag lsl 4) lor ((regen_size land 0x3f) lsl 6) in
          Bytes.set_uint8 output !header_pos h0;
          Bytes.set_uint16_le output (!header_pos + 1) (((regen_size lsr 6) land 0x3fff) lor ((total_compressed_size land 0x3) lsl 14));
          Bytes.set_uint16_le output (!header_pos + 3) ((total_compressed_size lsr 2) land 0xffff);
          header_pos := !header_pos + 5
        end;

        (* Write Huffman table *)
        Bytes.blit header_buf 0 output !header_pos header_size;
        header_pos := !header_pos + header_size;

        (* Write compressed streams *)
        Bytes.blit compressed 0 output !header_pos compressed_size;

        !header_pos + compressed_size - out_pos
      end
    end
  end

(** Compress literals - try Huffman, fall back to raw *)
let compress_literals literals ~pos ~len output ~out_pos =
  write_compressed_literals literals ~pos ~len output ~out_pos

(** Build predefined FSE compression tables *)
let ll_ctable = lazy (Fse.build_predefined_ctable Constants.ll_default_distribution Constants.ll_default_accuracy_log)
let ml_ctable = lazy (Fse.build_predefined_ctable Constants.ml_default_distribution Constants.ml_default_accuracy_log)
let of_ctable = lazy (Fse.build_predefined_ctable Constants.of_default_distribution Constants.of_default_accuracy_log)

(** Compress sequences section using predefined FSE tables.
    This implements proper zstd sequence encoding following RFC 8878.

    Matches C zstd's ZSTD_encodeSequences_body exactly:
    1. Initialize states with FSE_initCState2 using LAST sequence's codes
    2. Write LAST sequence's extra bits (LL, ML, OF order)
    3. For sequences n-2 down to 0:
       - FSE_encodeSymbol for OF, ML, LL
       - Extra bits for LL, ML, OF
    4. FSE_flushCState for ML, OF, LL
*)
let compress_sequences sequences output ~out_pos offset_history =
  if sequences = [] then begin
    (* Zero sequences *)
    Bytes.set_uint8 output out_pos 0;
    1
  end else begin
    let num_seq = List.length sequences in
    let header_size = ref 0 in

    (* Write sequence count (1-3 bytes) *)
    if num_seq < 128 then begin
      Bytes.set_uint8 output out_pos num_seq;
      header_size := 1
    end else if num_seq < 0x7f00 then begin
      Bytes.set_uint8 output out_pos ((num_seq lsr 8) + 128);
      Bytes.set_uint8 output (out_pos + 1) (num_seq land 0xff);
      header_size := 2
    end else begin
      Bytes.set_uint8 output out_pos 0xff;
      Bytes.set_uint16_le output (out_pos + 1) (num_seq - 0x7f00);
      header_size := 3
    end;

    (* Symbol compression modes byte:
       bits 0-1: Literals_Lengths_Mode (0 = predefined)
       bits 2-3: Offsets_Mode (0 = predefined)
       bits 4-5: Match_Lengths_Mode (0 = predefined)
       bits 6-7: reserved *)
    Bytes.set_uint8 output (out_pos + !header_size) 0b00;
    incr header_size;

    (* Get predefined FSE tables *)
    let ll_ct = Lazy.force ll_ctable in
    let ml_ct = Lazy.force ml_ctable in
    let of_ct = Lazy.force of_ctable in

    let offset_hist = Array.copy offset_history in
    let seq_array = Array.of_list sequences in

    (* Encode all sequences in forward order to track offset history *)
    let encoded = Array.map (fun seq ->
      let (ll_code, ll_extra, ll_extra_bits) = encode_lit_length_code seq.lit_length in
      let (ml_code, ml_extra, ml_extra_bits) = encode_match_length_code seq.match_length in
      let (of_code, of_extra, of_extra_bits) = encode_offset_code seq.match_offset offset_hist in

      (* Update offset history for real offsets (of_code > 1 means offBase > 2) *)
      if seq.match_offset > 0 && of_code > 1 then begin
        offset_hist.(2) <- offset_hist.(1);
        offset_hist.(1) <- offset_hist.(0);
        offset_hist.(0) <- seq.match_offset
      end;

      (ll_code, ll_extra, ll_extra_bits, ml_code, ml_extra, ml_extra_bits, of_code, of_extra, of_extra_bits)
    ) seq_array in

    (* Use a backward bit writer *)
    let stream = Bit_writer.Backward.create (num_seq * 20 + 32) in

    (* Get last sequence's codes for state initialization *)
    let last_idx = num_seq - 1 in
    let (ll_code_last, ll_extra_last, ll_extra_bits_last,
         ml_code_last, ml_extra_last, ml_extra_bits_last,
         of_code_last, of_extra_last, of_extra_bits_last) = encoded.(last_idx) in

    (* Initialize FSE states with LAST sequence's codes *)
    let ll_state = Fse.init_cstate2 ll_ct ll_code_last in
    let ml_state = Fse.init_cstate2 ml_ct ml_code_last in
    let of_state = Fse.init_cstate2 of_ct of_code_last in

    (* Write LAST sequence's extra bits first (LL, ML, OF order) *)
    if ll_extra_bits_last > 0 then
      Bit_writer.Backward.write_bits stream ll_extra_last ll_extra_bits_last;
    if ml_extra_bits_last > 0 then
      Bit_writer.Backward.write_bits stream ml_extra_last ml_extra_bits_last;
    if of_extra_bits_last > 0 then
      Bit_writer.Backward.write_bits stream of_extra_last of_extra_bits_last;

    (* Process sequences from n-2 down to 0 *)
    for i = last_idx - 1 downto 0 do
      let (ll_code, ll_extra, ll_extra_bits,
           ml_code, ml_extra, ml_extra_bits,
           of_code, of_extra, of_extra_bits) = encoded.(i) in

      (* FSE encode: OF, ML, LL order *)
      Fse.encode_symbol stream of_state of_code;
      Fse.encode_symbol stream ml_state ml_code;
      Fse.encode_symbol stream ll_state ll_code;

      (* Extra bits: LL, ML, OF order *)
      if ll_extra_bits > 0 then
        Bit_writer.Backward.write_bits stream ll_extra ll_extra_bits;
      if ml_extra_bits > 0 then
        Bit_writer.Backward.write_bits stream ml_extra ml_extra_bits;
      if of_extra_bits > 0 then
        Bit_writer.Backward.write_bits stream of_extra of_extra_bits
    done;

    (* Flush states: ML, OF, LL order *)
    Fse.flush_cstate stream ml_state;
    Fse.flush_cstate stream of_state;
    Fse.flush_cstate stream ll_state;

    (* Finalize and copy to output *)
    let seq_data = Bit_writer.Backward.finalize stream in
    let seq_len = Bytes.length seq_data in
    Bytes.blit seq_data 0 output (out_pos + !header_size) seq_len;

    !header_size + seq_len
  end

(** Write raw block (no compression) *)
let write_raw_block src ~pos ~len output ~out_pos =
  (* Raw block: header (3 bytes) + raw data
     Header format: bit 0 = last_block, bits 1-2 = block_type, bits 3-23 = block_size
     For raw: block_type = 0, block_size = number of bytes *)
  let header = (Constants.block_raw lsl 1) lor ((len land 0x1fffff) lsl 3) in
  Bytes.set_uint8 output out_pos (header land 0xff);
  Bytes.set_uint8 output (out_pos + 1) ((header lsr 8) land 0xff);
  Bytes.set_uint8 output (out_pos + 2) ((header lsr 16) land 0xff);
  Bytes.blit src pos output (out_pos + 3) len;
  3 + len

(** Write compressed block with sequences *)
let write_compressed_block src ~pos ~len sequences output ~out_pos offset_history =
  (* Collect all literals *)
  let total_lit_len = List.fold_left (fun acc seq -> acc + seq.lit_length) 0 sequences in
  let literals = Bytes.create total_lit_len in
  let lit_pos = ref 0 in
  let src_pos = ref pos in
  List.iter (fun seq ->
    if seq.lit_length > 0 then begin
      Bytes.blit src !src_pos literals !lit_pos seq.lit_length;
      lit_pos := !lit_pos + seq.lit_length;
      src_pos := !src_pos + seq.lit_length
    end;
    src_pos := !src_pos + seq.match_length
  ) sequences;

  (* Build block content in temp buffer *)
  let block_buf = Bytes.create (len * 2 + 256) in
  let block_pos = ref 0 in

  (* Write literals section *)
  let lit_size = compress_literals literals ~pos:0 ~len:total_lit_len block_buf ~out_pos:!block_pos in
  block_pos := !block_pos + lit_size;

  (* Filter out sequences with only literals (match_length = 0 and match_offset = 0)
     at the end - the last sequence can be literal-only *)
  let real_sequences = List.filter (fun seq ->
    seq.match_length > 0 || seq.match_offset > 0
  ) sequences in

  (* Write sequences section *)
  let seq_size = compress_sequences real_sequences block_buf ~out_pos:!block_pos offset_history in
  block_pos := !block_pos + seq_size;

  let block_size = !block_pos in

  (* Check if compressed block is actually smaller *)
  if block_size >= len then begin
    (* Fall back to raw block *)
    write_raw_block src ~pos ~len output ~out_pos
  end else begin
    (* Write compressed block header *)
    let header = (Constants.block_compressed lsl 1) lor ((block_size land 0x1fffff) lsl 3) in
    Bytes.set_uint8 output out_pos (header land 0xff);
    Bytes.set_uint8 output (out_pos + 1) ((header lsr 8) land 0xff);
    Bytes.set_uint8 output (out_pos + 2) ((header lsr 16) land 0xff);
    Bytes.blit block_buf 0 output (out_pos + 3) block_size;
    3 + block_size
  end

(** Write RLE block (single byte repeated) *)
let write_rle_block byte len output ~out_pos =
  (* RLE block: header (3 bytes) + single byte
     Header format: bit 0 = last_block, bits 1-2 = block_type, bits 3-23 = regen_size
     For RLE: block_type = 1, regen_size = number of bytes when expanded *)
  let header = (Constants.block_rle lsl 1) lor ((len land 0x1fffff) lsl 3) in
  Bytes.set_uint8 output out_pos (header land 0xff);
  Bytes.set_uint8 output (out_pos + 1) ((header lsr 8) land 0xff);
  Bytes.set_uint8 output (out_pos + 2) ((header lsr 16) land 0xff);
  Bytes.set_uint8 output (out_pos + 3) byte;
  4

(** Check if block is all same byte *)
let is_rle_block src ~pos ~len =
  if len = 0 then None
  else begin
    let first = Bytes.get_uint8 src pos in
    let all_same = ref true in
    for i = pos + 1 to pos + len - 1 do
      if Bytes.get_uint8 src i <> first then all_same := false
    done;
    if !all_same then Some first else None
  end

(** Compress a single block using LZ77 + FSE + Huffman.
    Falls back to RLE for repetitive data, or raw blocks if compression doesn't help. *)
let compress_block src ~pos ~len output ~out_pos params offset_history =
  if len = 0 then
    0
  else
    (* Check for RLE opportunity (all same byte) *)
    match is_rle_block src ~pos ~len with
    | Some byte when len > 4 ->
      (* RLE is worthwhile: 4 bytes instead of len+3 *)
      write_rle_block byte len output ~out_pos
    | _ ->
      (* Try LZ77 + FSE compression for compressible data *)
      let sequences = parse_sequences src ~pos ~len params in
      let match_count = List.fold_left (fun acc s ->
        if s.match_length > 0 then acc + 1 else acc) 0 sequences in
      (* Use compressed blocks for compressible data. The backward bitstream
         writer now uses periodic flushing like C zstd, supporting any size. *)
      if match_count >= 2 && len >= 64 then
        write_compressed_block src ~pos ~len sequences output ~out_pos offset_history
      else
        write_raw_block src ~pos ~len output ~out_pos

(** Write frame header *)
let write_frame_header output ~pos content_size window_log checksum_flag =
  (* Magic number *)
  Bytes.set_int32_le output pos Constants.zstd_magic_number;
  let out_pos = ref (pos + 4) in

  (* Use single segment mode for smaller content (no window descriptor needed).
     FCS field sizes when single_segment is set:
     - fcs_flag=0: 1 byte (content size 0-255)
     - fcs_flag=1: 2 bytes (content size 256-65791, stored with -256)
     - fcs_flag=2: 4 bytes
     - fcs_flag=3: 8 bytes *)
  let single_segment = content_size <= 131072L in

  let (fcs_flag, fcs_bytes) =
    if single_segment then begin
      if content_size <= 255L then (0, 1)
      else if content_size <= 65791L then (1, 2)  (* 2-byte has +256 offset *)
      else if content_size <= 0xFFFFFFFFL then (2, 4)
      else (3, 8)
    end else begin
      (* For non-single-segment, fcs_flag=0 means no FCS field *)
      if content_size = 0L then (0, 0)
      else if content_size <= 65535L then (1, 2)
      else if content_size <= 0xFFFFFFFFL then (2, 4)
      else (3, 8)
    end
  in

  (* Frame header descriptor:
     bit 0-1: dict ID flag (0 = no dict)
     bit 2: content checksum flag
     bit 3: reserved
     bit 4: unused
     bit 5: single segment (no window descriptor)
     bit 6-7: FCS field size flag *)
  let descriptor =
    (if checksum_flag then 0b00000100 else 0)
    lor (if single_segment then 0b00100000 else 0)
    lor (fcs_flag lsl 6)
  in
  Bytes.set_uint8 output !out_pos descriptor;
  incr out_pos;

  (* Window descriptor (only if not single segment) *)
  if not single_segment then begin
    let window_desc = ((window_log - 10) lsl 3) in
    Bytes.set_uint8 output !out_pos window_desc;
    incr out_pos
  end;

  (* Frame content size *)
  begin match fcs_bytes with
  | 1 ->
    Bytes.set_uint8 output !out_pos (Int64.to_int content_size);
    out_pos := !out_pos + 1
  | 2 ->
    (* 2-byte FCS stores value - 256 *)
    let adjusted = Int64.sub content_size 256L in
    Bytes.set_uint16_le output !out_pos (Int64.to_int adjusted);
    out_pos := !out_pos + 2
  | 4 ->
    Bytes.set_int32_le output !out_pos (Int64.to_int32 content_size);
    out_pos := !out_pos + 4
  | 8 ->
    Bytes.set_int64_le output !out_pos content_size;
    out_pos := !out_pos + 8
  | _ -> ()
  end;

  !out_pos - pos

(** Compress data to zstd frame *)
let compress ?(level = 3) ?(checksum = true) src =
  let src = Bytes.of_string src in
  let len = Bytes.length src in
  let params = get_level_params level in

  (* Allocate output buffer - worst case is slightly larger than input *)
  let max_output = len + len / 128 + 256 in
  let output = Bytes.create max_output in

  (* Initialize offset history *)
  let offset_history = Array.copy Constants.initial_repeat_offsets in

  (* Write frame header *)
  let header_size = write_frame_header output ~pos:0 (Int64.of_int len) params.window_log checksum in
  let out_pos = ref header_size in

  (* Compress blocks *)
  if len = 0 then begin
    (* Empty content: write an empty raw block with last_block flag *)
    (* Block header: last_block=1, block_type=raw(0), block_size=0 *)
    (* Header = 1 | (0 << 1) | (0 << 3) = 0x01 *)
    Bytes.set_uint8 output !out_pos 0x01;
    Bytes.set_uint8 output (!out_pos + 1) 0x00;
    Bytes.set_uint8 output (!out_pos + 2) 0x00;
    out_pos := !out_pos + 3
  end else begin
    let block_size = min len Constants.block_size_max in
    let pos = ref 0 in

    while !pos < len do
      let this_block = min block_size (len - !pos) in
      let is_last = !pos + this_block >= len in

      let block_len = compress_block src ~pos:!pos ~len:this_block output ~out_pos:!out_pos params offset_history in

      (* Set last block flag *)
      if is_last then begin
        let current = Bytes.get_uint8 output !out_pos in
        Bytes.set_uint8 output !out_pos (current lor 0x01)
      end;

      out_pos := !out_pos + block_len;
      pos := !pos + this_block
    done
  end;

  (* Write checksum if requested *)
  if checksum then begin
    let hash = Xxhash.hash64 src ~pos:0 ~len in
    (* Write only lower 32 bits *)
    Bytes.set_int32_le output !out_pos (Int64.to_int32 hash);
    out_pos := !out_pos + 4
  end;

  Bytes.sub_string output 0 !out_pos

(** Calculate maximum compressed size *)
let compress_bound len =
  len + len / 128 + 256

(** Write a skippable frame.
    @param variant Magic number variant 0-15
    @param content The content to embed in the skippable frame
    @return The complete skippable frame as a string *)
let write_skippable_frame ?(variant = 0) content =
  let variant = max 0 (min 15 variant) in
  let len = String.length content in
  if len > 0xFFFFFFFF then
    invalid_arg "Skippable frame content too large (max 4GB)";
  let output = Bytes.create (Constants.skippable_header_size + len) in
  (* Magic number: 0x184D2A50 + variant *)
  let magic = Int32.add Constants.skippable_magic_start (Int32.of_int variant) in
  Bytes.set_int32_le output 0 magic;
  (* Content size (4 bytes little-endian) *)
  Bytes.set_int32_le output 4 (Int32.of_int len);
  (* Content *)
  Bytes.blit_string content 0 output 8 len;
  Bytes.unsafe_to_string output