OCaml HTML5 parser/serialiser based on Python's JustHTML

Merge commit 'dba833a51df84e632618ce7d4b3370bee440cc7a' as 'odoc-xo'

+471
+17
odoc-xo/.gitignore
··· 1 + # OCaml build artifacts 2 + _build/ 3 + *.install 4 + *.merlin 5 + 6 + # Third-party sources (fetch locally with opam source) 7 + third_party/ 8 + 9 + # Editor and OS files 10 + .DS_Store 11 + *.swp 12 + *~ 13 + .vscode/ 14 + .idea/ 15 + 16 + # Opam local switch 17 + _opam/
+1
odoc-xo/.ocamlformat
··· 1 + version=0.28.1
+53
odoc-xo/.tangled/workflows/build.yml
··· 1 + when: 2 + - event: ["push", "pull_request"] 3 + branch: ["main"] 4 + 5 + engine: nixery 6 + 7 + dependencies: 8 + nixpkgs: 9 + - shell 10 + - stdenv 11 + - findutils 12 + - binutils 13 + - libunwind 14 + - ncurses 15 + - opam 16 + - git 17 + - gawk 18 + - gnupatch 19 + - gnum4 20 + - gnumake 21 + - gnutar 22 + - gnused 23 + - gnugrep 24 + - diffutils 25 + - gzip 26 + - bzip2 27 + - gcc 28 + - ocaml 29 + - pkg-config 30 + 31 + steps: 32 + - name: opam 33 + command: | 34 + opam init --disable-sandboxing -a -y 35 + - name: repo 36 + command: | 37 + opam repo add aoah https://tangled.org/anil.recoil.org/aoah-opam-repo.git 38 + - name: switch 39 + command: | 40 + opam install . --confirm-level=unsafe-yes --deps-only 41 + - name: build 42 + command: | 43 + opam exec -- dune build 44 + - name: switch-test 45 + command: | 46 + opam install . --confirm-level=unsafe-yes --deps-only --with-test 47 + - name: test 48 + command: | 49 + opam exec -- dune runtest --verbose 50 + - name: doc 51 + command: | 52 + opam install -y odoc 53 + opam exec -- dune build @doc
+15
odoc-xo/LICENSE.md
··· 1 + ISC License 2 + 3 + Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org> 4 + 5 + Permission to use, copy, modify, and distribute this software for any 6 + purpose with or without fee is hereby granted, provided that the above 7 + copyright notice and this permission notice appear in all copies. 8 + 9 + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+41
odoc-xo/README.md
··· 1 + # odoc-xo - Transform odoc HTML to x-ocaml Web Components 2 + 3 + A command-line tool that transforms HTML documents generated by odoc to use 4 + interactive x-ocaml web components for OCaml code blocks. 5 + 6 + ## Key Features 7 + 8 + - Transforms `<pre class="language-ocaml">` blocks to `<x-ocaml>` elements 9 + - Processes MDX-style code blocks (lines starting with `# `) 10 + - Injects script and style tags for x-ocaml web components 11 + - Configurable script paths for the x-ocaml runtime 12 + 13 + ## Usage 14 + 15 + Transform a file and write to stdout: 16 + 17 + ``` 18 + odoc-xo tutorial.html 19 + ``` 20 + 21 + Transform and write to a file: 22 + 23 + ``` 24 + odoc-xo -o output.html input.html 25 + ``` 26 + 27 + Use custom script paths: 28 + 29 + ``` 30 + odoc-xo --js-src /assets/x-ocaml.js input.html 31 + ``` 32 + 33 + ## Installation 34 + 35 + ``` 36 + opam install odoc-xo 37 + ``` 38 + 39 + ## License 40 + 41 + ISC
+10
odoc-xo/dune
··· 1 + ; Root dune file 2 + 3 + ; Ignore third_party directory (for fetched dependency sources) 4 + 5 + (data_only_dirs third_party ocaml-html5rw) 6 + 7 + (executable 8 + (name odoc_xo) 9 + (public_name odoc-xo) 10 + (libraries html5rw eio_main cmdliner bytesrw-eio))
+29
odoc-xo/dune-project
··· 1 + (lang dune 3.20) 2 + 3 + (name odoc-xo) 4 + 5 + (generate_opam_files true) 6 + 7 + (license ISC) 8 + (authors "Anil Madhavapeddy") 9 + (homepage "https://tangled.org/@anil.recoil.org/odoc-xo") 10 + (maintainers "Anil Madhavapeddy <anil@recoil.org>") 11 + (bug_reports "https://tangled.org/@anil.recoil.org/odoc-xo/issues") 12 + (maintenance_intent "(latest)") 13 + 14 + (package 15 + (name odoc-xo) 16 + (synopsis "Transform odoc HTML to use x-ocaml web components") 17 + (description 18 + "A command-line tool that transforms HTML documents generated by odoc \ 19 + to use interactive x-ocaml web components for OCaml code blocks. \ 20 + It finds all <pre class=\"language-ocaml\"> blocks and replaces them \ 21 + with <x-ocaml> elements, injecting necessary scripts and styles.") 22 + (depends 23 + (ocaml (>= 5.1.0)) 24 + (cmdliner (>= 1.2.0)) 25 + (eio (>= 1.0)) 26 + (eio_main (>= 1.0)) 27 + (html5rw (>= 0.1.0)) 28 + (bytesrw-eio (>= 0.1.0)) 29 + (odoc :with-doc)))
+1
odoc-xo/ocaml-html5rw
··· 1 + ../ocaml-html5rw
+35
odoc-xo/odoc-xo.opam
··· 1 + # This file is generated by dune, edit dune-project instead 2 + opam-version: "2.0" 3 + synopsis: "Transform odoc HTML to use x-ocaml web components" 4 + description: 5 + "A command-line tool that transforms HTML documents generated by odoc to use interactive x-ocaml web components for OCaml code blocks. It finds all <pre class=\"language-ocaml\"> blocks and replaces them with <x-ocaml> elements, injecting necessary scripts and styles." 6 + maintainer: ["Anil Madhavapeddy <anil@recoil.org>"] 7 + authors: ["Anil Madhavapeddy"] 8 + license: "ISC" 9 + homepage: "https://tangled.org/@anil.recoil.org/odoc-xo" 10 + bug-reports: "https://tangled.org/@anil.recoil.org/odoc-xo/issues" 11 + depends: [ 12 + "dune" {>= "3.20"} 13 + "ocaml" {>= "5.1.0"} 14 + "cmdliner" {>= "1.2.0"} 15 + "eio" {>= "1.0"} 16 + "eio_main" {>= "1.0"} 17 + "html5rw" {>= "0.1.0"} 18 + "bytesrw-eio" {>= "0.1.0"} 19 + "odoc" {with-doc} 20 + ] 21 + build: [ 22 + ["dune" "subst"] {dev} 23 + [ 24 + "dune" 25 + "build" 26 + "-p" 27 + name 28 + "-j" 29 + jobs 30 + "@install" 31 + "@runtest" {with-test} 32 + "@doc" {with-doc} 33 + ] 34 + ] 35 + x-maintenance-intent: ["(latest)"]
+269
odoc-xo/odoc_xo.ml
(*---------------------------------------------------------------------------
   Copyright (c) 2025 Anil Madhavapeddy <anil@recoil.org>. All rights reserved.
   SPDX-License-Identifier: ISC
  ---------------------------------------------------------------------------*)

(** odoc-xo: Transform odoc HTML to use x-ocaml web components *)

open Cmdliner

(** Configuration for the transformation.

    Each field is copied verbatim into an attribute of the injected
    [<script>] element (see {!inject_head_elements}). *)
type config = {
  js_src : string;      (** value of the [src] attribute *)
  worker_src : string;  (** value of the [src-worker] attribute *)
  libs_src : string;    (** value of the [src-load] attribute *)
}

(** Defaults used by the command-line options when none is given. *)
let default_config = {
  js_src = "x-ocaml.js";
  worker_src = "x-ocaml.worker.js";
  libs_src = "libs.js";
}

(** CSS styles to inject *)
let css_styles = {|
body {
  font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, sans-serif;
  max-width: 900px;
  margin: 0 auto;
  padding: 20px;
  line-height: 1.6;
}
h1, h2, h3 { color: #333; }
code { background: #f4f4f4; padding: 2px 6px; border-radius: 3px; font-family: "SF Mono", Consolas, monospace; }
pre { background: #f4f4f4; padding: 15px; border-radius: 5px; overflow-x: auto; }
x-ocaml { display: block; margin: 15px 0; }
a { color: #0066cc; }
blockquote { border-left: 4px solid #ddd; margin-left: 0; padding-left: 20px; color: #666; font-style: italic; }
.x-ocaml-wrapper { max-width: 900px; font-size: 85%; }
@media only screen and (max-width: 210ex) {
  div.editor {
    max-width: 50vw;
  }
}
@media only screen and (max-width: 110ex) {
  div.editor {
    max-width: 80vw;
  }
  .x-ocaml-wrapper { max-width: 600px; }
}
|}

(** [is_mdx_prompt line] is [true] when [line], ignoring surrounding
    whitespace, starts with the MDX toplevel prompt (a hash followed by a
    space). Shared by {!process_mdx_content} and {!is_mdx_content} so that
    detection and extraction always agree on what a prompt is. *)
let is_mdx_prompt line =
  String.starts_with ~prefix:"# " (String.trim line)

(** Process MDX-style code blocks to extract just the OCaml phrases.

    MDX blocks have lines starting with a hash and a space for input phrases.
    A phrase continues until a line ending in [;;] or the end of the block.
    Lines after [;;] until the next prompt are expected outputs and are
    skipped.

    @param text the raw text of one code block
    @return the extracted phrases joined with newlines; the empty string if
            [text] contains no prompt line *)
let process_mdx_content text =
  let lines = String.split_on_char '\n' text in
  let ends_with_double_semi s =
    let s = String.trim s in
    let len = String.length s in
    len >= 2 && String.sub s (len - 2) 2 = ";;"
  in
  (* [pending] holds the lines of the phrase being collected, most recent
     first. *)
  let join pending = String.concat "\n" (List.rev pending) in
  (* State is carried by [pending]: empty means we are between phrases and
     waiting for a prompt, non-empty means we are collecting a phrase. *)
  let rec process acc pending = function
    | [] ->
      (* End of block - flush any phrase that never reached its ';;'. *)
      let acc = match pending with
        | [] -> acc
        | _ :: _ -> join pending :: acc
      in
      List.rev acc
    | line :: rest ->
      match pending with
      | [] ->
        if is_mdx_prompt line then begin
          (* The first non-blank character is the prompt hash, so
             [String.index] finds it; drop the hash and the space after it. *)
          let start = String.index line '#' + 2 in
          let phrase_start =
            String.sub line start (String.length line - start)
          in
          if ends_with_double_semi phrase_start then
            (* Single-line phrase, complete *)
            process (phrase_start :: acc) [] rest
          else
            (* Multi-line phrase, continue collecting *)
            process acc [phrase_start] rest
        end else
          (* A lone hash or an expected-output line - skip *)
          process acc [] rest
      | _ :: _ ->
        (* Inside a phrase - collect until a line ends with ';;' *)
        let pending = line :: pending in
        if ends_with_double_semi line then
          process (join pending :: acc) [] rest
        else
          process acc pending rest
  in
  String.concat "\n" (process [] [] lines)

(** Check if content looks like MDX format, i.e. has at least one prompt line
    as recognised by {!is_mdx_prompt}.

    The test deliberately requires the space after the hash: a block that
    merely contains a line such as [#require "pkg";;] is ordinary code, and
    treating it as MDX would make {!process_mdx_content} discard every line
    of it. *)
let is_mdx_content text =
  List.exists is_mdx_prompt (String.split_on_char '\n' text)

(** Replace a node with another in its parent.

    @raise Failure if [old_node] has no parent *)
let replace_node ~old_node ~new_node =
  match old_node.Html5rw.Dom.parent with
  | None -> failwith "Cannot replace node without parent"
  | Some parent ->
    (* Insert first so the new node takes the position of the old one. *)
    Html5rw.insert_before parent new_node old_node;
    Html5rw.remove_child parent old_node

(** Inject script and style elements into the head.

    Appends a [<script>] element configured from [config] and a [<style>]
    element holding {!css_styles} to the first node matched by the [head]
    selector. Prints a warning on stderr and does nothing when there is no
    such node. *)
let inject_head_elements config doc =
  match Html5rw.query doc "head" with
  | [] ->
    Printf.eprintf "Warning: No <head> element found, skipping injection\n%!"
  | head :: _ ->
    (* Create the script element *)
    let script = Html5rw.create_element "script"
        ~attrs:[
          ("async", "");
          ("src", config.js_src);
          ("src-worker", config.worker_src);
          ("src-load", config.libs_src);
          ("crossorigin", "anonymous");
        ] () in

    (* Create the style element *)
    let style = Html5rw.create_element "style" () in
    Html5rw.append_child style (Html5rw.create_text css_styles);

    (* Append to head *)
    Html5rw.append_child head script;
    Html5rw.append_child head style

(** Transform a single <pre class="language-ocaml"><code>...</code></pre> block.

    The text is taken from the first direct [<code>] child when there is one,
    otherwise from the [<pre>] node itself. MDX-formatted text is reduced to
    its input phrases. The [<pre>] node is then replaced by
    [<div class="x-ocaml-wrapper"><x-ocaml>TEXT</x-ocaml></div>].

    @raise Failure if [pre_node] has no parent (via {!replace_node}) *)
let transform_code_block pre_node =
  (* Find the first <code> child, if any *)
  let code_child =
    List.find_opt
      (fun child ->
         Html5rw.is_element child && child.Html5rw.Dom.name = "code")
      pre_node.Html5rw.Dom.children
  in
  let raw_text = match code_child with
    | None -> Html5rw.get_text_content pre_node
    | Some code -> Html5rw.get_text_content code
  in
  (* Process MDX content if it looks like MDX format *)
  let text_content =
    if is_mdx_content raw_text then process_mdx_content raw_text
    else raw_text
  in
  let xocaml = Html5rw.create_element "x-ocaml" () in
  Html5rw.append_child xocaml (Html5rw.create_text text_content);
  (* Wrap in a div with max-width constraint *)
  let wrapper =
    Html5rw.create_element "div" ~attrs:[("class", "x-ocaml-wrapper")] ()
  in
  Html5rw.append_child wrapper xocaml;
  replace_node ~old_node:pre_node ~new_node:wrapper

(** Transform all OCaml code blocks in the document.

    Prints an informational message on stderr when the document contains no
    node matching [pre.language-ocaml]. *)
let transform_code_blocks doc =
  (* Find all <pre class="language-ocaml"> elements *)
  let pre_nodes = Html5rw.query doc "pre.language-ocaml" in
  (* Pattern match rather than use polymorphic equality on DOM nodes. *)
  (match pre_nodes with
   | [] ->
     Printf.eprintf "Info: No <pre class=\"language-ocaml\"> blocks found\n%!"
   | _ :: _ -> ());
  (* Transform each one, last match first *)
  List.iter transform_code_block (List.rev pre_nodes)

(** Main transformation function: injects the head elements, rewrites the
    code blocks, and returns the same (mutated) document. *)
let transform config doc =
  inject_head_elements config doc;
  transform_code_blocks doc;
  doc

(** Run the transformation.

    Reads [input], transforms it according to [config], and writes the result
    to [output], or to stdout when [output] is [None].

    Paths are resolved through [Eio.Stdenv.fs] rather than [Eio.Stdenv.cwd]:
    [cwd] is confined to the working directory and rejects absolute paths and
    paths that escape through [..], which are legitimate for a command-line
    tool. Relative paths are still interpreted relative to the working
    directory. *)
let run config ~input ~output =
  Eio_main.run @@ fun env ->
  let fs = Eio.Stdenv.fs env in

  (* Read input file *)
  let html_content = Eio.Path.load Eio.Path.(fs / input) in

  (* Parse HTML *)
  let doc = Html5rw.parse (Bytesrw.Bytes.Reader.of_string html_content) in

  (* Transform *)
  let transformed = transform config doc in

  (* Serialise to any sink flow using bytesrw-eio, then signal end of data. *)
  let emit flow =
    let writer = Bytesrw_eio.bytes_writer_of_flow flow in
    Html5rw.to_writer ~pretty:false transformed writer;
    Bytesrw.Bytes.Writer.write_eod writer
  in
  match output with
  | None ->
    (* Write to stdout *)
    emit (Eio.Stdenv.stdout env)
  | Some output_file ->
    let output_path = Eio.Path.(fs / output_file) in
    Eio.Path.with_open_out ~create:(`Or_truncate 0o644) output_path emit

(* CLI argument definitions *)

let input_arg =
  let doc = "Input HTML file to transform." in
  Arg.(required & pos 0 (some string) None & info [] ~docv:"INPUT" ~doc)

let output_arg =
  let doc = "Output file. If not specified, writes to stdout." in
  Arg.(value & opt (some string) None & info ["o"; "output"] ~docv:"FILE" ~doc)

let js_src_arg =
  let doc = "Path to x-ocaml.js script." in
  Arg.(value & opt string default_config.js_src
       & info ["js-src"] ~docv:"URL" ~doc)

let worker_src_arg =
  let doc = "Path to x-ocaml.worker.js script." in
  Arg.(value & opt string default_config.worker_src
       & info ["worker-src"] ~docv:"URL" ~doc)

let libs_src_arg =
  let doc = "Path to libs.js script." in
  Arg.(value & opt string default_config.libs_src
       & info ["libs-src"] ~docv:"URL" ~doc)

(** Command entry point: builds the configuration from the parsed options,
    runs the transformation, and maps any exception to a Cmdliner error
    result so the process exits with a message instead of a backtrace. *)
let cmd_run input output js_src worker_src libs_src =
  let config = { js_src; worker_src; libs_src } in
  try
    run config ~input ~output;
    `Ok ()
  with
  | Eio.Io (Eio.Fs.E (Eio.Fs.Not_found _), _) ->
    `Error (false, Printf.sprintf "File not found: %s" input)
  | exn ->
    `Error (false, Printf.sprintf "Error: %s" (Printexc.to_string exn))

let cmd =
  let doc = "Transform odoc HTML to use x-ocaml web components" in
  let man = [
    `S Manpage.s_description;
    `P "$(tname) transforms HTML documents generated by odoc to use interactive \
        x-ocaml web components for OCaml code blocks.";
    `P "The tool finds all $(b,<pre class=\"language-ocaml\"><code>...</code></pre>) \
        blocks and replaces them with $(b,<x-ocaml>...</x-ocaml>) elements.";
    `P "It also injects the necessary script and style tags into the document head.";
    `S Manpage.s_examples;
    `P "Transform a file and write to stdout:";
    `Pre "  $(tname) tutorial.html";
    `P "Transform and write to a file:";
    `Pre "  $(tname) -o output.html input.html";
    `P "Use custom script paths:";
    `Pre "  $(tname) --js-src /assets/x-ocaml.js input.html";
  ] in
  let info = Cmd.info "odoc-xo" ~version:"0.1.0" ~doc ~man in
  Cmd.v info
    Term.(ret (const cmd_run $ input_arg $ output_arg $ js_src_arg
               $ worker_src_arg $ libs_src_arg))

let () = exit (Cmd.eval cmd)