OCaml HTML5 parser/serialiser based on Python's JustHTML

OCaml Fuzzing with Crowbar#

This guide covers setting up property-based fuzz testing for OCaml libraries using Crowbar. Crowbar supports two modes:

  1. QuickCheck mode - Fast, deterministic testing without AFL (good for CI)
  2. AFL mode - Coverage-guided fuzzing with American Fuzzy Lop

Quick Start#

1. Add Dependencies#

Add crowbar to your opam dependencies:

opam install crowbar

2. Create Fuzz Test Directory#

Create a fuzz/ directory at the root of your project:

project/
├── lib/
├── fuzz/
│   ├── dune
│   └── fuzz_mylib.ml
└── dune-project

3. Add Dune Configuration#

Create fuzz/dune:

(executable
 (name fuzz_mylib)
 (libraries mylib crowbar))

For multiple fuzz targets:

(executable
 (name fuzz_parser)
 (libraries mylib crowbar))

(executable
 (name fuzz_encoder)
 (libraries mylib crowbar))

4. Write Fuzz Tests#

See Test Structure below for patterns and examples.

5. Run Tests#

QuickCheck mode (no AFL required):

dune exec ./fuzz/fuzz_mylib.exe

AFL mode (requires AFL instrumentation):

# Build with AFL instrumentation
dune build --context afl

# Run AFL fuzzer
mkdir -p input_corpus output
echo "seed" > input_corpus/seed
afl-fuzz -i input_corpus -o output -- _build/afl/fuzz/fuzz_mylib.exe @@

Test Structure#

Basic Test Pattern#

open Crowbar

(* Property: operation shouldn't crash on arbitrary input *)
let () =
  add_test ~name:"mylib_no_crash" [ bytes ] @@ fun input ->
  match MyLib.parse input with
  | Ok _ | Error _ -> check true

(* Property: roundtrip encode/decode *)
let () =
  add_test ~name:"mylib_roundtrip" [ int ] @@ fun n ->
  let encoded = MyLib.encode n in
  match MyLib.decode encoded with
  | Error e -> failwith ("roundtrip failed: " ^ e)
  | Ok decoded -> check (decoded = n)

Crowbar Generators#

Crowbar provides generators for creating test inputs:

open Crowbar

(* Primitive generators *)
let _ = bool        (* bool *)
let _ = int         (* int (small range in quickcheck mode) *)
let _ = int8        (* 0-255 *)
let _ = int32       (* int32 *)
let _ = int64       (* int64 *)
let _ = float       (* float *)
let _ = bytes       (* arbitrary byte string *)
let _ = bytes_fixed 10  (* exactly 10 bytes *)

(* Range generator *)
let _ = range 100   (* 0 to 99 *)

(* Choice from constants *)
let _ = choose [
  const "hello";
  const "world";
  const "";
]

(* Mapping over generators *)
let positive_int = map [int] (fun n -> abs n)

(* Optional values *)
let maybe_int = option int  (* int option *)

(* Lists *)
let int_list = list int     (* int list *)

(* Pairs and tuples *)
let pair = pair int bytes   (* int * bytes *)

Common Test Patterns#

1. Crash Resistance (Robustness)#

Test that arbitrary input doesn't cause crashes:

let () =
  add_test ~name:"parser_no_crash" [ bytes ] @@ fun input ->
  (* Should never raise an exception *)
  let _ = try Parser.parse input with _ -> [] in
  check true

2. Roundtrip Property#

Test encode/decode symmetry:

let () =
  add_test ~name:"codec_roundtrip" [ int ] @@ fun value ->
  match encode value with
  | Error e -> failwith ("encode failed: " ^ e)
  | Ok encoded ->
    match decode encoded with
    | Error e -> failwith ("decode failed: " ^ e)
    | Ok decoded -> check (decoded = value)

3. Idempotence#

Test that applying operation twice equals applying once:

let () =
  add_test ~name:"normalize_idempotent" [ bytes ] @@ fun input ->
  let once = normalize input in
  let twice = normalize once in
  check (once = twice)

4. Known Edge Cases#

Test specific problematic inputs:

let () =
  add_test ~name:"empty_input" [ const () ] @@ fun () ->
  match parse "" with
  | Ok _ | Error _ -> check true

let () =
  add_test ~name:"boundary_values"
    [ choose [
        const 0;
        const (-1);
        const max_int;
        const min_int;
      ] ]
  @@ fun n ->
  match process n with
  | Ok _ | Error _ -> check true

5. Type Mismatch Handling#

Test graceful handling of wrong types:

let () =
  add_test ~name:"type_mismatch" [ bytes ] @@ fun input ->
  (* Parsing arbitrary bytes as int should fail gracefully *)
  match decode_int input with
  | Ok _ -> check true  (* unexpected success is fine *)
  | Error _ -> check true  (* expected failure *)

6. Comparison with Reference Implementation#

let () =
  add_test ~name:"matches_reference" [ int; int ] @@ fun a b ->
  let result = MyLib.add a b in
  let expected = a + b in
  check (result = expected)

Handling Expected Failures#

When some inputs are expected to fail:

(* Large integers may exceed safe precision *)
let max_safe_int = 9007199254740991

let () =
  add_test ~name:"large_int_handling" [ range max_int ] @@ fun n ->
  match encode n with
  | Error _ ->
    (* Large ints beyond safe precision correctly rejected *)
    check (n > max_safe_int)
  | Ok encoded ->
    match decode encoded with
    | Error _ -> check (n > max_safe_int)
    | Ok decoded -> check (decoded = n)

Float Comparison#

Use epsilon comparison for floating-point roundtrips:

let () =
  add_test ~name:"float_roundtrip"
    [ choose [ const 0.0; const 3.14; const (-2.5) ] ]
  @@ fun f ->
  match encode f |> Result.bind decode with
  | Error e -> failwith e
  | Ok decoded ->
    (* Use epsilon comparison for floats *)
    check (Float.abs (decoded -. f) < 0.0001)

AFL Fuzzing Setup#

Prerequisites#

  • OCaml compiler with AFL support
  • AFL or AFL++ installed
  • Dune 3.0+

Dune Workspace Configuration#

Create or update dune-workspace at project root:

(lang dune 3.20)

(context default)

(context
 (default
  (name afl)
  (profile afl)))

(env
 (afl
  (ocamlopt_flags (:standard -afl-instrument))))

This creates an afl build context that instruments binaries for coverage-guided fuzzing.

Building for AFL#

dune build --context afl

Instrumented binaries are output to _build/afl/.

Running AFL#

# Create input corpus with seed files
mkdir -p input_corpus
echo '{"key": "value"}' > input_corpus/json1
echo 'key: value' > input_corpus/yaml1

# Run AFL fuzzer
afl-fuzz -i input_corpus -o findings -- _build/afl/fuzz/fuzz_mylib.exe @@

Common AFL Issues#

Core Pattern Error#

AFL: Oops, your system is configured to send core dump notifications to an external utility...

Fix:

sudo su -c 'echo core > /proc/sys/kernel/core_pattern'

CPU Scaling Error#

AFL: The CPU frequency scaling governor is set to "powersave"...

Fix:

sudo cpufreq-set -g performance
# Or use AFL_SKIP_CPUFREQ=1 to ignore (not recommended)

Complete Example#

Here's a complete fuzz test file for a YAML library:

(** Property-based tests for MyYaml using Crowbar *)

open Crowbar

(* Helper functions *)
let decode codec s =
  let reader = Bytesrw.Bytes.Reader.of_string s in
  MyYaml.decode codec reader

let encode codec v =
  let buf = Buffer.create 256 in
  let writer = Bytesrw.Bytes.Writer.of_buffer buf in
  match MyYaml.encode codec v writer with
  | Ok () -> Ok (Buffer.contents buf)
  | Error e -> Error e

(* Test: Arbitrary input shouldn't crash *)
let () =
  add_test ~name:"yaml_no_crash" [ bytes ] @@ fun input ->
  match decode Codec.string input with
  | Ok _ | Error _ -> check true

(* Test: String roundtrip *)
let () =
  add_test ~name:"yaml_string_roundtrip"
    [ choose [ const "hello"; const ""; const "with\nnewline" ] ]
  @@ fun s ->
  match encode Codec.string s with
  | Error e -> failwith ("encode: " ^ e)
  | Ok yaml ->
    match decode Codec.string yaml with
    | Error e -> failwith ("decode: " ^ e)
    | Ok decoded -> check (decoded = s)

(* Test: Int roundtrip *)
let () =
  add_test ~name:"yaml_int_roundtrip" [ range 1000000 ] @@ fun n ->
  match encode Codec.int n with
  | Error e -> failwith ("encode: " ^ e)
  | Ok yaml ->
    match decode Codec.int yaml with
    | Error e -> failwith ("decode: " ^ e)
    | Ok decoded -> check (decoded = n)

(* Test: Bool roundtrip *)
let () =
  add_test ~name:"yaml_bool_roundtrip" [ bool ] @@ fun b ->
  match encode Codec.bool b with
  | Error e -> failwith ("encode: " ^ e)
  | Ok yaml ->
    match decode Codec.bool yaml with
    | Error e -> failwith ("decode: " ^ e)
    | Ok decoded -> check (decoded = b)

(* Test: List roundtrip *)
let () =
  add_test ~name:"yaml_list_roundtrip"
    [ choose [ const []; const [1;2;3]; const [0] ] ]
  @@ fun lst ->
  let codec = Codec.list Codec.int in
  match encode codec lst with
  | Error e -> failwith ("encode: " ^ e)
  | Ok yaml ->
    match decode codec yaml with
    | Error e -> failwith ("decode: " ^ e)
    | Ok decoded -> check (decoded = lst)

(* Test: Empty input *)
let () =
  add_test ~name:"yaml_empty" [ const () ] @@ fun () ->
  match decode Codec.string "" with
  | Ok _ | Error _ -> check true

(* Test: Nested structures *)
let () =
  add_test ~name:"yaml_nested"
    [ choose [ const [[]]; const [[1;2];[3;4]] ] ]
  @@ fun nested ->
  let codec = Codec.list (Codec.list Codec.int) in
  match encode codec nested with
  | Error e -> failwith ("encode: " ^ e)
  | Ok yaml ->
    match decode codec yaml with
    | Error e -> failwith ("decode: " ^ e)
    | Ok decoded -> check (decoded = nested)

Tips for Writing Effective Fuzz Tests#

  1. Start with crash resistance - The most basic test ensures arbitrary input doesn't crash your code.

  2. Test roundtrips - If you have encode/decode pairs, roundtrip tests catch many bugs.

  3. Use choose for edge cases - Combine random generation with known problematic values.

  4. Keep generators focused - Generate only valid inputs for properties that require them.

  5. Handle expected failures gracefully - Use pattern matching to distinguish expected vs unexpected failures.

  6. Test at boundaries - Empty inputs, maximum values, and type boundaries often reveal bugs.

  7. Run QuickCheck mode in CI - It's fast and catches many issues without AFL setup.

  8. Use AFL for deep testing - AFL's coverage guidance finds edge cases that random testing misses.

Bugs Found by This Approach#

Example bugs discovered through fuzz testing:

  • Empty string encoding - Empty strings weren't properly quoted, causing decode failures
  • Empty array encoding - Empty arrays [] weren't using flow style, breaking roundtrips
  • Large integer precision - Integers > 2^53-1 lost precision in float-based formats
  • Option type decoding - Nested type combinators weren't being unwrapped during scalar decoding
  • Special character handling - Strings with newlines or special YAML characters needed escaping

HTML Parser Bugs (html5rw)#

Fuzz testing html5rw discovered these serialization bugs:

  • Raw text element double-escaping - Elements like <script>, <style>, <xmp>, <iframe>, <noembed>, <noframes>, <noscript> were having their content HTML-escaped during serialization, which broke roundtrips (e.g., < becoming &lt; inside script tags)
  • Escapable raw text handling - Elements <textarea> and <title> need special treatment: only & needs escaping, not < or >
  • Plaintext element serialization - The <plaintext> element cannot be closed in HTML5, so serialization must stop after emitting its content

AFL++ Compatibility Notes#

When using AFL++ (modern fork of AFL) with OCaml:

Crowbar Persistent Mode Issues#

Crowbar's built-in AFL support may not work with AFL++. The @@ file-based mode with Crowbar often results in "No instrumentation detected" errors even when instrumentation is present.

Solution: Create a separate AFL fuzzer using afl-persistent directly:

(* fuzz_afl.ml - Simple AFL-compatible fuzzer *)

let fuzz_input input =
  try
    (* Your fuzzing logic here *)
    let result = MyLib.parse input in
    let serialized = MyLib.serialize result in
    (* Check properties, assert on failures *)
    if not (some_property serialized) then
      assert false;
    true
  with
  | Assert_failure _ -> raise (Assert_failure ("", 0, 0))
  | _ -> true  (* Expected failures for malformed input *)

let read_file filename =
  let ic = open_in_bin filename in
  let n = in_channel_length ic in
  let s = really_input_string ic n in
  close_in ic;
  s

let () =
  AflPersistent.run (fun () ->
    if Array.length Sys.argv < 2 then begin
      Printf.eprintf "Usage: %s <input_file>\n" Sys.argv.(0);
      exit 1
    end;
    let input = read_file Sys.argv.(1) in
    ignore (fuzz_input input)
  )

Add afl-persistent to dependencies in dune:

(executable
 (name fuzz_afl)
 (libraries mylib afl-persistent))

Building with AFL Instrumentation#

Use -x afl flag (not --context afl):

dune build -x afl ./fuzz/fuzz_afl.exe

Environment Variables for AFL++#

# Suppress core_pattern warning (when you can't modify /proc/sys/kernel/core_pattern)
export AFL_I_DONT_CARE_ABOUT_MISSING_CRASHES=1

# Skip CPU frequency scaling check
export AFL_SKIP_CPUFREQ=1

# Run AFL
afl-fuzz -i input_corpus -o output -- _build/afl/fuzz/fuzz_afl.exe @@

Verifying Instrumentation#

Use afl-showmap to verify instrumentation is detected:

afl-showmap -o /tmp/map -- _build/afl/fuzz/fuzz_afl.exe /tmp/test_input
cat /tmp/map | head -20  # Should show tuples like "000062:3"

A healthy instrumentation should capture 200+ tuples for a non-trivial program.

Creating Effective Seed Corpus#

For HTML parsers, include diverse constructs:

mkdir -p fuzz/input_corpus

# Basic structure
echo '<html><head><title>Test</title></head><body><p>Hello</p></body></html>' > fuzz/input_corpus/basic.html

# Raw text elements (critical for escaping bugs)
echo '<script>if (a < b && c > d) {}</script>' > fuzz/input_corpus/script.html
echo '<style>.foo { content: "<bar>" }</style>' > fuzz/input_corpus/style.html

# Escapable raw text elements
echo '<textarea>Text with <html> tags & entities</textarea>' > fuzz/input_corpus/textarea.html

# Tables (complex nesting rules)
echo '<table><tr><td>Cell</td></tr></table>' > fuzz/input_corpus/table.html

# Forms
echo '<form><input type="text"><button>Submit</button></form>' > fuzz/input_corpus/form.html

# SVG/MathML (foreign content)
echo '<svg><circle r="50"/></svg>' > fuzz/input_corpus/svg.html
echo '<math><mrow><mi>x</mi></mrow></math>' > fuzz/input_corpus/math.html

# Entities
echo '<p>&amp; &lt; &gt; &nbsp;</p>' > fuzz/input_corpus/entities.html

# Template elements
echo '<template><div>Content</div></template>' > fuzz/input_corpus/template.html

# Malformed HTML (parser recovery)
echo '<p>Unclosed<div>Mixed</p></div>' > fuzz/input_corpus/malformed.html

# Comments
echo '<!--comment-->Text<!---->More' > fuzz/input_corpus/comment.html

1. Roundtrip Stabilization#

The most effective parser test: parse → serialize → parse → serialize should produce identical output on the second iteration:

let () =
  add_test ~name:"roundtrip_stabilizes" [ bytes ] @@ fun input ->
  let r1 = parse input in
  let s1 = serialize r1 in
  let r2 = parse s1 in
  let s2 = serialize r2 in
  let r3 = parse s2 in
  let s3 = serialize r3 in
  (* First roundtrip may differ, but second must stabilize *)
  check (s2 = s3)

2. Clone Consistency#

For DOM-like structures with clone operations:

let () =
  add_test ~name:"clone_identical" [ bytes ] @@ fun input ->
  let doc = parse input in
  let root = get_root doc in
  let cloned = clone ~deep:true root in
  check (to_html root = to_html cloned)

3. Error Consistency#

Parse errors should be deterministic:

let () =
  add_test ~name:"errors_consistent" [ bytes ] @@ fun input ->
  let r1 = parse input in
  let r2 = parse input in
  check (get_errors r1 = get_errors r2)

Workflow for Fuzz-Driven Bug Fixing#

  1. Run QuickCheck mode first - Fast iteration, catches obvious bugs

    dune exec ./fuzz/fuzz_mylib.exe
    
  2. Fix bugs, re-run - Iterate until QuickCheck passes

  3. Run AFL for deeper coverage - 5-10 minutes initially

    dune build -x afl ./fuzz/fuzz_afl.exe
    AFL_I_DONT_CARE_ABOUT_MISSING_CRASHES=1 AFL_SKIP_CPUFREQ=1 \
      timeout 600 afl-fuzz -i fuzz/input_corpus -o fuzz/output -- \
      _build/afl/fuzz/fuzz_afl.exe @@
    
  4. Check for crashes - Examine fuzz/output/default/crashes/

    ls fuzz/output/default/crashes/
    # Reproduce crash:
    _build/afl/fuzz/fuzz_afl.exe fuzz/output/default/crashes/id:000000,...
    
  5. Fix and repeat - Continue until no crashes found after extended runs

  6. Measure coverage - Track corpus growth and edge coverage

    cat fuzz/output/default/fuzzer_stats | grep -E "(execs_done|corpus_count|saved_crashes)"