OCaml codecs for the Citation File Format (CFF)

cleanup

+58 -285
+3
.gitmodules
··· 1 + [submodule "vendor/git/citation-file-format"] 2 + path = vendor/git/citation-file-format 3 + url = https://github.com/citation-file-format/citation-file-format.git
-1
lib/cff.ml
··· 6 6 (** Citation File Format (CFF) codec for OCaml. *) 7 7 8 8 (* Module aliases *) 9 - module Config = Cff_config 10 9 module Date = Cff_date 11 10 module Country = Cff_country 12 11 module License = Cff_license
+55 -115
lib/cff.mli
··· 10 10 version 1.2.0, a human- and machine-readable format for software and 11 11 dataset citation metadata. 12 12 13 - CFF files are plain text files named [CITATION.cff] written in 14 - {{:https://yaml.org/}YAML 1.2}. They provide citation metadata for 15 - software and datasets, enabling proper academic credit for research 16 - software. 13 + CFF files are named [CITATION.cff] and are written in the {{:https://yaml.org/}YAML 1.2} 14 + format. They provide citation metadata for software and datasets, enabling 15 + proper academic credit for research software. 17 16 18 17 {1 Overview} 19 18 ··· 34 33 ~title:"My Research Software" 35 34 ~authors:[author] 36 35 ~version:"1.0.0" 37 - ~doi:"10.5281/zenodo.1234567" 36 + ~doi:"10.12345/zenodo.1234567" 38 37 () 39 38 ]} 40 39 41 40 {2 File I/O} 42 41 43 42 For file operations, use the backend-specific subpackages: 44 - - [cff.unix] - Unix file I/O using [In_channel]/[Out_channel] 45 - - [cff.eio] - Eio-based I/O using [bytesrw-eio] 43 + - [cff.unix] - Unix file I/O using {!In_channel}/{!Out_channel} 44 + - [cff.eio] - Eio-based I/O using {!Bytesrw_eio} to serialise to flows 46 45 47 46 Example with [cff.unix]: 48 47 {[ 49 48 match Cff_unix.of_file "CITATION.cff" with 50 - | Ok cff -> Printf.printf "Title: %s\n" (Cff.title cff) 51 - | Error msg -> Printf.eprintf "Error: %s\n" msg 49 + | Ok cff -> Printf.printf "Title: %s\n%!" (Cff.title cff) 50 + | Error msg -> Printf.eprintf "Error: %s\n%!" msg 52 51 ]} 53 52 54 - {1 Module Structure} 55 - 56 - The library uses a flat internal structure ([Cff_author], [Cff_date], etc.) 
57 - but exposes a convenient nested API through module aliases: 58 - 59 - - {!module:Author} - Person and entity types for authorship 60 - - {!module:Reference} - Bibliographic reference with 60+ fields 61 - - {!module:Identifier} - DOI, URL, SWH, and other identifiers 62 - - {!module:License} - SPDX license identifiers 63 - - {!module:Date} - ISO 8601 date handling 64 - 65 53 {1 CFF Specification} 66 54 67 55 This implementation follows the 68 56 {{:https://github.com/citation-file-format/citation-file-format}CFF 1.2.0 specification}. 69 - Key concepts: 57 + Key concepts include: 70 58 71 - - {b Authors}: Can be persons (with family/given names) or entities 59 + - {!module:Author}: Can be persons (with family/given names) or entities 72 60 (organizations, identified by a [name] field) 73 - - {b References}: Bibliography entries that the work cites or depends on 74 - - {b Preferred citation}: An alternate work to cite instead of the 75 - software itself (e.g., a journal article about the software) 76 - - {b Identifiers}: Typed identifiers including DOIs, URLs, and 61 + - {!module:Reference}: Bibliography entries that the work cites or depends on 62 + - {!module:Identifier}: Typed identifiers including DOIs, URLs, and 77 63 Software Heritage IDs (SWH) 78 - - {b Licenses}: SPDX license identifiers; multiple licenses imply OR 64 + - {!module:License}: SPDX license identifiers where multiple licenses imply "OR" 79 65 80 66 {1 Core Types} *) 81 67 82 - (** Configuration for validation strictness. *) 83 - module Config = Cff_config 68 + (** The main [t] type represents a complete [CITATION.cff] file with all 69 + required and optional fields from the CFF 1.2.0 specification. 
70 + 71 + Every valid CFF file must include: 72 + - {!cff_version}: Schema version (defaults to ["1.2.0"]) 73 + - {!message}: Instructions for citing the work (has sensible default) 74 + - {!title}: Name of the software or dataset 75 + - {!authors}: List of persons and/or entities 76 + 77 + Optional fields are: 78 + - {!version}: Software version string 79 + - {!doi}: Digital Object Identifier 80 + - {!date_released}: Publication/release date 81 + - {!license}: SPDX license identifier(s) 82 + - {!keywords}: Descriptive keywords 83 + - {!abstract}: Description of the work 84 + 85 + The {!preferred_citation} field allows redirecting citations to 86 + a related work (e.g., a journal article describing the software). 87 + The {!section-references} field lists works that the software cites or 88 + depends upon. *) 89 + 90 + (** The abstract type representing a complete CFF document. *) 91 + type t 84 92 85 93 (** Date representation as [(year, month, day)] tuple. 86 94 ··· 99 107 (the user may choose any of the listed licenses). *) 100 108 module License = Cff_license 101 109 102 - (** {1 Enumeration Types} *) 103 - 104 - (** Identifier types for the [identifiers] field. 105 - 106 - - [`Doi] - Digital Object Identifier 107 - - [`Url] - Web URL 108 - - [`Swh] - Software Heritage identifier 109 - - [`Other] - Other identifier type *) 110 - module Identifier_type = Cff_enums.Identifier_type 111 - 112 - (** Reference types for bibliographic entries. 113 - 114 - CFF supports 40+ reference types including [`Article], [`Book], 115 - [`Software], [`Conference_paper], [`Thesis], [`Dataset], and more. 116 - See {!Cff_enums.Reference_type} for the complete list. *) 117 - module Reference_type = Cff_enums.Reference_type 118 - 119 - (** Publication status for works in progress. 
120 - 121 - - [`Preprint] - Available as preprint 122 - - [`Submitted] - Submitted for publication 123 - - [`In_press] - Accepted, awaiting publication 124 - - [`Advance_online] - Published online ahead of print *) 125 - module Status = Cff_enums.Status 126 - 127 110 (** CFF file type: [`Software] (default) or [`Dataset]. *) 128 111 module Cff_type = Cff_enums.Cff_type 129 112 130 - (** {1 Address and Contact Information} *) 131 - 132 - (** Physical address with street, city, region, postal code, and country. *) 133 - module Address = Cff_address.Address 134 - 135 - (** Contact information: email, telephone, fax, website, and ORCID. *) 136 - module Contact = Cff_address.Contact 137 - 138 113 (** {1 Authors and Entities} *) 139 114 140 115 (** Authors as a discriminated union of {!Person} or {!Entity}. ··· 146 121 When parsing, the presence of a [name] field indicates an entity; 147 122 otherwise, the entry is treated as a person. *) 148 123 module Author = Cff_author 149 - 150 - (** Person name components: family names, given names, particle, suffix, alias. *) 151 - module Name = Cff_author.Name 152 124 153 125 (** A person (individual author or contributor). *) 154 126 module Person = Cff_author.Person ··· 186 158 - {!Reference.Technical} - Commit, version, format *) 187 159 module Reference = Cff_reference 188 160 189 - (** {1 Root CFF Type} 190 - 191 - The main [t] type represents a complete [CITATION.cff] file with all 192 - required and optional fields from the CFF 1.2.0 specification. 
193 - 194 - {2 Required Fields} 195 - 196 - Every valid CFF file must include: 197 - - {!cff_version}: Schema version (defaults to ["1.2.0"]) 198 - - {!message}: Instructions for citing the work (has sensible default) 199 - - {!title}: Name of the software or dataset 200 - - {!authors}: List of persons and/or entities 201 - 202 - {2 Common Optional Fields} 203 - 204 - - {!version}: Software version string 205 - - {!doi}: Digital Object Identifier 206 - - {!date_released}: Publication/release date 207 - - {!license}: SPDX license identifier(s) 208 - - {!keywords}: Descriptive keywords 209 - - {!abstract}: Description of the work 210 - 211 - {2 Citation Redirection} 212 - 213 - The {!preferred_citation} field allows redirecting citations to 214 - a related work (e.g., a journal article describing the software). 215 - The {!references} field lists works that the software cites or 216 - depends upon. *) 217 - 218 - (** The abstract type representing a complete CFF document. *) 219 - type t 220 - 221 161 (** {1 Construction} *) 222 162 223 163 val default_cff_version : string ··· 231 171 ?cff_version:string -> 232 172 ?message:string -> 233 173 title:string -> 234 - authors:Cff_author.t list -> 174 + authors:Author.t list -> 235 175 ?abstract:string -> 236 176 ?commit:string -> 237 - ?contact:Cff_author.t list -> 238 - ?date_released:Cff_date.t -> 177 + ?contact:Author.t list -> 178 + ?date_released:Date.t -> 239 179 ?doi:string -> 240 - ?identifiers:Cff_identifier.t list -> 180 + ?identifiers:Identifier.t list -> 241 181 ?keywords:string list -> 242 - ?license:Cff_license.t -> 182 + ?license:License.t -> 243 183 ?license_url:string -> 244 - ?preferred_citation:Cff_reference.t -> 245 - ?references:Cff_reference.t list -> 184 + ?preferred_citation:Reference.t -> 185 + ?references:Reference.t list -> 246 186 ?repository:string -> 247 187 ?repository_artifact:string -> 248 188 ?repository_code:string -> 249 - ?type_:Cff_enums.Cff_type.t -> 189 + ?type_:Cff_type.t -> 250 
190 ?url:string -> 251 191 ?version:string -> 252 192 unit -> t ··· 257 197 @param title The name of the software or dataset 258 198 @param authors List of persons and/or entities who created the work *) 259 199 260 - (** {1 Required Fields} *) 200 + (** {2 Required Fields} *) 261 201 262 202 val cff_version : t -> string 263 203 (** The CFF schema version that this file adheres to. ··· 280 220 This is the title that should appear in citations. For software, it's 281 221 typically the project name; for datasets, the dataset title. *) 282 222 283 - val authors : t -> Cff_author.t list 223 + val authors : t -> Author.t list 284 224 (** The creators of the software or dataset. 285 225 286 226 Authors can be persons (individuals) or entities (organizations). 287 227 At least one author is required for a valid CFF file. The order 288 228 typically reflects contribution significance. *) 289 229 290 - (** {1 Optional Fields} *) 230 + (** {2 Optional Fields} *) 291 231 292 232 val abstract : t -> string option 293 233 (** A description of the software or dataset. ··· 300 240 Useful for precise version identification beyond semantic versioning. 301 241 Example: ["1ff847d81f29c45a3a1a5ce73d38e45c2f319bba"] *) 302 242 303 - val contact : t -> Cff_author.t list option 243 + val contact : t -> Author.t list option 304 244 (** Contact persons or entities for the software or dataset. 305 245 306 246 May differ from authors; useful when the primary contact is a 307 247 project maintainer rather than the original author. *) 308 248 309 - val date_released : t -> Cff_date.t option 249 + val date_released : t -> Date.t option 310 250 (** The date when the software or dataset was released. 311 251 312 252 Format is [(year, month, day)], corresponding to ISO 8601 [YYYY-MM-DD]. *) ··· 318 258 for a single DOI; use {!identifiers} for multiple DOIs or other 319 259 identifier types. 
Example: ["10.5281/zenodo.1234567"] *) 320 260 321 - val identifiers : t -> Cff_identifier.t list option 261 + val identifiers : t -> Identifier.t list option 322 262 (** Additional identifiers beyond the primary DOI. 323 263 324 264 Each identifier has a type (DOI, URL, SWH, other), value, and ··· 331 271 Help with discoverability and categorization. Example: 332 272 [["machine learning"; "image processing"; "python"]] *) 333 273 334 - val license : t -> Cff_license.t option 274 + val license : t -> License.t option 335 275 (** The SPDX license identifier(s) for the work. 336 276 337 277 Uses {{:https://spdx.org/licenses/}SPDX identifiers}. Multiple ··· 344 284 Only needed for licenses not in the SPDX list. Standard SPDX 345 285 licenses have well-known URLs. *) 346 286 347 - val preferred_citation : t -> Cff_reference.t option 287 + val preferred_citation : t -> Reference.t option 348 288 (** A reference to cite instead of the software itself. 349 289 350 290 Used for "credit redirection" when authors prefer citation of ··· 352 292 Note: Software citation principles recommend citing software 353 293 directly; use this field judiciously. *) 354 294 355 - val references : t -> Cff_reference.t list option 295 + val references : t -> Reference.t list option 356 296 (** Works that this software cites or depends upon. 357 297 358 298 Functions like a bibliography, listing dependencies, foundational ··· 377 317 Typically a GitHub, GitLab, or similar URL where the source 378 318 code is publicly accessible. *) 379 319 380 - val type_ : t -> Cff_enums.Cff_type.t option 320 + val type_ : t -> Cff_type.t option 381 321 (** The type of work: [`Software] (default) or [`Dataset]. 382 322 383 323 Most CFF files describe software; use [`Dataset] for data packages. *)
-60
lib/cff_config.ml
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 The ocaml-cff programmers. All rights reserved. 3 - SPDX-License-Identifier: ISC 4 - ---------------------------------------------------------------------------*) 5 - 6 - (** Configuration for CFF parsing and validation. *) 7 - 8 - type t = { 9 - strict_urls : bool; 10 - strict_dates : bool; 11 - strict_dois : bool; 12 - strict_orcids : bool; 13 - strict_licenses : bool; 14 - keep_unknown : bool; 15 - } 16 - 17 - let default = { 18 - strict_urls = false; 19 - strict_dates = false; 20 - strict_dois = false; 21 - strict_orcids = false; 22 - strict_licenses = false; 23 - keep_unknown = true; 24 - } 25 - 26 - let strict = { 27 - strict_urls = true; 28 - strict_dates = true; 29 - strict_dois = true; 30 - strict_orcids = true; 31 - strict_licenses = true; 32 - keep_unknown = true; 33 - } 34 - 35 - let lenient = { 36 - strict_urls = false; 37 - strict_dates = false; 38 - strict_dois = false; 39 - strict_orcids = false; 40 - strict_licenses = false; 41 - keep_unknown = true; 42 - } 43 - 44 - let make 45 - ?(strict_urls = false) 46 - ?(strict_dates = false) 47 - ?(strict_dois = false) 48 - ?(strict_orcids = false) 49 - ?(strict_licenses = false) 50 - ?(keep_unknown = true) 51 - () = 52 - { strict_urls; strict_dates; strict_dois; strict_orcids; 53 - strict_licenses; keep_unknown } 54 - 55 - let strict_urls t = t.strict_urls 56 - let strict_dates t = t.strict_dates 57 - let strict_dois t = t.strict_dois 58 - let strict_orcids t = t.strict_orcids 59 - let strict_licenses t = t.strict_licenses 60 - let keep_unknown t = t.keep_unknown
-109
lib/cff_config.mli
··· 1 - (*--------------------------------------------------------------------------- 2 - Copyright (c) 2025 The ocaml-cff programmers. All rights reserved. 3 - SPDX-License-Identifier: ISC 4 - ---------------------------------------------------------------------------*) 5 - 6 - (** Configuration for CFF parsing and validation. 7 - 8 - CFF files in the wild may contain non-standard or deprecated values. 9 - This module provides configuration options to control validation 10 - strictness during parsing. 11 - 12 - {1 Validation Modes} 13 - 14 - {2 Strict Mode} 15 - 16 - Validates all fields according to their specifications: 17 - 18 - - URLs must be well-formed 19 - - Dates must be valid ISO 8601 dates 20 - - DOIs must match the DOI pattern 21 - - ORCIDs must be valid ORCID URLs 22 - - License IDs must be valid SPDX identifiers 23 - 24 - Use strict mode for validating CFF files or when you control the input. 25 - 26 - {2 Lenient Mode} 27 - 28 - Accepts any string value without validation. Use lenient mode when: 29 - 30 - - Parsing CFF files from unknown sources 31 - - Handling legacy files with deprecated license IDs 32 - - Round-tripping files without data loss 33 - 34 - {2 Default Mode} 35 - 36 - A balanced approach that: 37 - - Keeps unknown fields (for round-tripping) 38 - - Uses lenient validation for most fields 39 - 40 - {1 Unknown Fields} 41 - 42 - The [keep_unknown] option controls handling of unrecognized fields: 43 - 44 - - [true]: Preserve unknown fields in the parsed structure 45 - - [false]: Silently ignore unknown fields 46 - 47 - Keeping unknown fields allows round-tripping CFF files that contain 48 - extensions or newer fields not yet supported by this library. *) 49 - 50 - type t 51 - (** Configuration type. *) 52 - 53 - val default : t 54 - (** Default configuration. 55 - 56 - Uses lenient validation and keeps unknown fields. Suitable for 57 - general parsing where round-tripping is desired. 
*) 58 - 59 - val strict : t 60 - (** Strict configuration. 61 - 62 - Validates all fields according to CFF 1.2.0 specification. 63 - Fails on invalid URLs, dates, DOIs, ORCIDs, and license IDs. 64 - 65 - Keeps unknown fields for compatibility. *) 66 - 67 - val lenient : t 68 - (** Fully lenient configuration. 69 - 70 - Accepts any string values without validation. Useful for parsing 71 - malformed or non-standard CFF files. *) 72 - 73 - val make : 74 - ?strict_urls:bool -> 75 - ?strict_dates:bool -> 76 - ?strict_dois:bool -> 77 - ?strict_orcids:bool -> 78 - ?strict_licenses:bool -> 79 - ?keep_unknown:bool -> 80 - unit -> t 81 - (** Create a custom configuration. 82 - 83 - All strictness options default to [false] (lenient). 84 - [keep_unknown] defaults to [true]. 85 - 86 - @param strict_urls Validate URL format 87 - @param strict_dates Validate date format and values 88 - @param strict_dois Validate DOI pattern 89 - @param strict_orcids Validate ORCID format 90 - @param strict_licenses Validate SPDX license identifiers 91 - @param keep_unknown Preserve unrecognized fields *) 92 - 93 - val strict_urls : t -> bool 94 - (** Whether URL fields are validated. *) 95 - 96 - val strict_dates : t -> bool 97 - (** Whether date fields are validated. *) 98 - 99 - val strict_dois : t -> bool 100 - (** Whether DOI fields are validated. *) 101 - 102 - val strict_orcids : t -> bool 103 - (** Whether ORCID fields are validated. *) 104 - 105 - val strict_licenses : t -> bool 106 - (** Whether license identifiers are validated against SPDX. *) 107 - 108 - val keep_unknown : t -> bool 109 - (** Whether unknown fields are preserved in the parsed structure. *)