(* OCaml codecs for the Citation File Format (CFF)
   at main 348 lines 12 kB view raw *)
(*---------------------------------------------------------------------------
  Copyright (c) 2026 The ocaml-cff programmers. All rights reserved.
  SPDX-License-Identifier: ISC
 ---------------------------------------------------------------------------*)

(** Citation File Format (CFF) codec for OCaml.

    This library provides codecs for the
    {{:https://citation-file-format.github.io/}Citation File Format (CFF)}
    version 1.2.0, a human- and machine-readable format for software and
    dataset citation metadata.

    CFF files are named [CITATION.cff] and are written in the
    {{:https://yaml.org/}YAML 1.2} format. They provide citation metadata for
    software and datasets, enabling proper academic credit for research
    software.

    {1 Overview}

    A minimal [CITATION.cff] file requires four fields:
    - [cff-version]: The CFF schema version (currently ["1.2.0"])
    - [message]: Instructions for citing the work
    - [title]: The name of the software or dataset
    - [authors]: A list of persons and/or entities

    {2 Creating a CFF record}

    {[
      let author = Cff.Author.person
        ~family_names:"Smith" ~given_names:"Jane" () in
      let cff = Cff.make
        ~title:"My Research Software"
        ~authors:[author]
        ~version:"1.0.0"
        ~doi:"10.12345/zenodo.1234567"
        ()
    ]}

    {2 I/O}

    For file operations, use the backend-specific subpackages:
    - [cff.unix] - Unix file I/O using {!In_channel}/{!Out_channel}
    - [cff.eio] - Eio-based I/O using [Bytesrw_eio] to serialise to flows

    Example with [cff.unix]:
    {[
      match Cff_unix.of_file "CITATION.cff" with
      | Ok cff -> Printf.printf "Title: %s\n%!" (Cff.title cff)
      | Error msg -> Printf.eprintf "Error: %s\n%!" msg
    ]}

    {1 CFF Specification}

    This implementation follows the
    {{:https://github.com/citation-file-format/citation-file-format}CFF 1.2.0 specification}.
    Useful modules include:

    - {!module:Author}: Can be persons (with family/given names) or entities
      (organizations, identified by a [name] field)
    - {!module:Reference}: Bibliography entries that the work cites or depends on
    - {!module:Identifier}: Typed identifiers including DOIs, URLs, and
      Software Heritage IDs (SWH)
    - {!module:License}: SPDX license identifiers where multiple licenses imply "OR"

    {1 Core Types} *)

(** The main [t] type represents a complete [CITATION.cff] file with all
    required and optional fields from the CFF 1.2.0 specification.

    Every valid CFF file must include:
    - {!cff_version}: Schema version (defaults to ["1.2.0"])
    - {!message}: Instructions for citing the work (has sensible default)
    - {!title}: Name of the software or dataset
    - {!authors}: List of persons and/or entities

    Optional fields are:
    - {!version}: Software version string
    - {!doi}: Digital Object Identifier
    - {!date_released}: Publication/release date
    - {!license}: SPDX license identifier(s)
    - {!keywords}: Descriptive keywords
    - {!abstract}: Description of the work

    The {!preferred_citation} field allows redirecting citations to
    a related work (e.g., a journal article describing the software).
    The {!val-references} field lists works that the software cites or
    depends upon. *)

(** The abstract type representing a complete CFF document. *)
type t

(** Date representation as [(year, month, day)] tuple.

    CFF uses ISO 8601 dates in [YYYY-MM-DD] format (e.g., ["2024-01-15"]). *)
module Date = Cff_date

(** ISO 3166-1 alpha-2 country codes (e.g., ["US"], ["DE"], ["GB"]).

    Used for author and entity addresses. *)
module Country = Cff_country

(** Physical address information.

    Address fields used for persons and entities: street address, city,
    region (state/province), postal code, and country code. *)
module Address = Cff_address.Address

(** Contact information.

    Contact fields used for persons and entities: email, telephone, fax,
    website URL, and ORCID identifier. *)
module Contact = Cff_address.Contact

(** SPDX license identifiers.

    CFF uses {{:https://spdx.org/licenses/}SPDX license identifiers} for
    the [license] field. Multiple licenses indicate an OR relationship
    (the user may choose any of the listed licenses). *)
module License = Cff_license

(** CFF file type: [`Software] (default) or [`Dataset]. *)
module Cff_type = Cff_enums.Cff_type

(** {1 Authors and Entities} *)

(** Authors as a discriminated union of {!Person} or {!Entity}.

    CFF distinguishes between:
    - {b Persons}: Individual humans with family names, given names, etc.
    - {b Entities}: Organizations, projects, or groups with a [name] field

    When parsing, the presence of a [name] field indicates an entity;
    otherwise, the entry is treated as a person. *)
module Author = Cff_author

(** A person (individual author or contributor). *)
module Person = Cff_author.Person

(** An entity (organization, institution, project, conference). *)
module Entity = Cff_author.Entity

(** {1 Identifiers and References} *)

(** Typed identifiers for DOI, URL, SWH, or other schemes.

    Each identifier has a type, value, and optional description. Example:
    {[
      let id = Cff.Identifier.make
        ~type_:`Doi
        ~value:"10.5281/zenodo.1234567"
        ~description:"The concept DOI for all versions"
        ()
    ]} *)
module Identifier = Cff_identifier

(** Bibliographic references with comprehensive metadata.

    References can represent any citable work: articles, books, software,
    datasets, conference papers, theses, etc. The {!Reference} module
    provides 60+ fields organized into logical sub-records:

    - {!Reference.Core} - Type, title, authors, abstract
    - {!Reference.Publication} - Journal, volume, issue, pages
    - {!Reference.Collection} - Proceedings, book series
    - {!Reference.Dates} - Various date fields and year
    - {!Reference.Identifiers} - DOI, URL, ISBN, ISSN, etc.
    - {!Reference.Entities} - Editors, publisher, institution
    - {!Reference.Metadata} - Keywords, license, notes
    - {!Reference.Technical} - Commit, version, format *)
module Reference = Cff_reference

(** {1 Construction} *)

(** The default CFF version used when not specified: ["1.2.0"]. *)
val default_cff_version : string

(** The default citation message:
    ["If you use this software, please cite it using the metadata from this file."] *)
val default_message : string

(** [make ~title ~authors ...] constructs a CFF value.

    Only [title] and [authors] are mandatory. Every other labelled argument
    is optional and corresponds to the field returned by the accessor of the
    same name (e.g. [?doi] is read back with {!doi}, [?version] with
    {!version}); see those accessors for the meaning of each field. The
    trailing [unit] argument allows the optional arguments to be omitted.

    @param cff_version The CFF schema version (default: {!default_cff_version})
    @param message Instructions for users on how to cite (default: {!default_message})
    @param title The name of the software or dataset
    @param authors List of persons and/or entities who created the work *)
val make
  :  ?cff_version:string
  -> ?message:string
  -> title:string
  -> authors:Author.t list
  -> ?abstract:string
  -> ?commit:string
  -> ?contact:Author.t list
  -> ?date_released:Date.t
  -> ?doi:string
  -> ?identifiers:Identifier.t list
  -> ?keywords:string list
  -> ?license:License.t
  -> ?preferred_citation:Reference.t
  -> ?references:Reference.t list
  -> ?repository:string
  -> ?repository_artifact:string
  -> ?repository_code:string
  -> ?type_:Cff_type.t
  -> ?url:string
  -> ?version:string
  -> unit
  -> t

(** {2 Required Fields} *)

(** The CFF schema version that this file adheres to.

    For CFF 1.2.0 files, this should be ["1.2.0"]. The version determines
    which keys are valid and how they should be interpreted. *)
val cff_version : t -> string

(** A message to readers explaining how to cite the work.

    Common examples:
    - ["If you use this software, please cite it using the metadata from this file."]
    - ["Please cite this software using the metadata from 'preferred-citation'."]

    The message should guide users toward the preferred citation method. *)
val message : t -> string

(** The name of the software or dataset.

    This is the title that should appear in citations. For software, it's
    typically the project name; for datasets, the dataset title. *)
val title : t -> string

(** The creators of the software or dataset.

    Authors can be persons (individuals) or entities (organizations).
    At least one author is required for a valid CFF file. The order
    typically reflects contribution significance. *)
val authors : t -> Author.t list

(** {2 Optional Fields} *)

(** A description of the software or dataset.

    Provides context about what the work does, its purpose, and scope. *)
val abstract : t -> string option

(** The commit hash or revision number of the software version.

    Useful for precise version identification beyond semantic versioning.
    Example: ["1ff847d81f29c45a3a1a5ce73d38e45c2f319bba"] *)
val commit : t -> string option

(** Contact persons or entities for the software or dataset.

    May differ from authors; useful when the primary contact is a
    project maintainer rather than the original author. *)
val contact : t -> Author.t list option

(** The date when the software or dataset was released.

    Format is [(year, month, day)], corresponding to ISO 8601 [YYYY-MM-DD]. *)
val date_released : t -> Date.t option

(** The Digital Object Identifier for the software or dataset.

    DOIs provide persistent, citable identifiers. This is a shorthand
    for a single DOI; use {!identifiers} for multiple DOIs or other
    identifier types. Example: ["10.5281/zenodo.1234567"] *)
val doi : t -> string option

(** Additional identifiers beyond the primary DOI.

    Each identifier has a type (DOI, URL, SWH, other), value, and
    optional description. Useful for versioned DOIs, Software Heritage
    identifiers, or repository URLs. *)
val identifiers : t -> Identifier.t list option

(** Descriptive keywords for the work.

    Help with discoverability and categorization. Example:
    [["machine learning"; "image processing"; "python"]] *)
val keywords : t -> string list option

(** The SPDX license identifier(s) for the work.

    Uses {{:https://spdx.org/licenses/}SPDX identifiers}. Multiple
    licenses imply an OR relationship (user may choose any).
    Example: ["MIT"], ["Apache-2.0"], or [["GPL-3.0-only"; "MIT"]]. *)
val license : t -> License.t option

(** A reference to cite instead of the software itself.

    Used for "credit redirection" when authors prefer citation of
    a related publication (e.g., a methods paper) over the software.
    Note: Software citation principles recommend citing software
    directly; use this field judiciously. *)
val preferred_citation : t -> Reference.t option

(** Works that this software cites or depends upon.

    Functions like a bibliography, listing dependencies, foundational
    works, or related publications. Each reference includes full
    bibliographic metadata. *)
val references : t -> Reference.t list option

(** URL to the repository where the software is developed.

    Typically a version control system URL. For source code repositories,
    prefer {!repository_code}. *)
val repository : t -> string option

(** URL to the built/compiled artifact repository.

    For binary distributions, package registries (npm, PyPI, CRAN),
    or container registries. *)
val repository_artifact : t -> string option

(** URL to the source code repository.

    Typically a GitHub, GitLab, or similar URL where the source
    code is publicly accessible. *)
val repository_code : t -> string option

(** The type of work: [`Software] (default) or [`Dataset].

    Most CFF files describe software; use [`Dataset] for data packages. *)
val type_ : t -> Cff_type.t option

(** The URL of the software or dataset homepage.

    A general landing page, documentation site, or project website. *)
val url : t -> string option

(** The version string of the software or dataset.

    Can be any version format: semantic versioning (["1.2.3"]),
    date-based (["2024.01"]), or other schemes. *)
val version : t -> string option

(** {1 Formatting and Codec} *)

(** Pretty-print a CFF value in a human-readable YAML-like format. *)
val pp : Format.formatter -> t -> unit

(** JSON/YAML codec for serialization and deserialization.

    Used internally by the YAML codec functions. *)
val jsont : t Jsont.t