A research repository into geolabels, not for wide use yet
at main 846 lines 32 kB view raw
(** AOH worked example: {i Panthera leo} in the Serengeti ecosystem.

    Walks the complete label pipeline — raw observations, synthetic
    simulation, Area of Habitat (AOH) calculation — combining data from:

    - Camera traps (Serengeti Lion Project grid)
    - GPS collars (Movebank study 1234)
    - GBIF occurrence records
    - iNaturalist citizen science observations
    - IUCN Red List expert range and habitat preferences
    - Lotka-Volterra population simulation (synthetic)
    - TESSERA v3.1 habitat classification

    The provenance graph:
    {v
    AOH polygon
    ├── species_range (alpha-shape, measured-only)
    │   ├── camera trap detections
    │   ├── GPS collar fixes (Movebank)
    │   ├── GBIF occurrences
    │   └── iNaturalist observations
    ├── IUCN expert range (validation)
    └── habitat suitability tiles (TESSERA)
        └── training set
            ├── all measured occurrences
            ├── IUCN habitat preferences
            └── synthetic augmentation (Lotka-Volterra)
    v} *)

open Terradots

(** Shorthand for [event_date_of_string]. *)
let ed = event_date_of_string

(** Shorthand for [cell_of_string]. *)
let c = cell_of_string

(* ══════════════════════════════════════════════════════════
   1. Activities — the audit trail

   Every activity ties a batch of labels to the agent that
   produced it and the time it happened. Where applicable the
   [agent] field is a Fairground notebook URI.
   ══════════════════════════════════════════════════════════ *)

(** Field survey behind the camera trap labels (agent is an ORCID iD). *)
let act_field_survey =
  { activity_id = "act-field-2024";
    date = "2024-06-15T08:00:00Z";
    agent = "orcid:0000-0002-1234-5678";
    description =
      Some "Serengeti Lion Project 2024 dry-season camera trap survey" }

(** Import of the GPS collar fixes from Movebank. *)
let act_movebank_import =
  { activity_id = "act-movebank-import";
    date = "2024-07-01T12:00:00Z";
    agent = "fairground:notebook/movebank-ingest:v2";
    description =
      Some "Bulk import of GPS collar data from Movebank study 1234, \
            individuals leo-007 and leo-012" }

(** Import of the GBIF occurrence records. *)
let act_gbif_import =
  { activity_id = "act-gbif-import";
    date = "2024-07-02T10:00:00Z";
    agent = "fairground:notebook/gbif-ingest:v3";
    description =
      Some "GBIF Panthera leo occurrences, East Africa, 2020-2024" }

(** Import of the iNaturalist observations. *)
let act_inat_import =
  { activity_id = "act-inat-import";
    date = "2024-07-02T14:00:00Z";
    agent = "fairground:notebook/inat-ingest:v1";
    description =
      Some "iNaturalist research-grade P. leo \
            observations, Serengeti-Mara ecosystem" }

(** Import of the IUCN Red List range polygon and habitat codes. *)
let act_iucn_import =
  { activity_id = "act-iucn-import";
    date = "2024-07-03T09:00:00Z";
    agent = "fairground:notebook/iucn-ingest:v1";
    description =
      Some "IUCN Red List Panthera leo assessment: \
            expert range polygon and habitat preference codes" }

(** The Lotka-Volterra run that produced the synthetic labels. *)
let act_simulation =
  { activity_id = "act-sim-lv-001";
    date = "2024-07-10T16:00:00Z";
    agent = "fairground:notebook/lotka-volterra-serengeti:v4@cell-7";
    description =
      Some "Lotka-Volterra predator-prey simulation, lion-zebra-wildebeest, \
            Serengeti parameterisation, 100-year projection, seed=42" }

(** Assembly of the SDM training set. *)
let act_training_set =
  { activity_id = "act-training-2024";
    date = "2024-07-15T10:00:00Z";
    agent = "fairground:notebook/sdm-training:v2";
    description =
      Some "Assemble training set for P. leo SDM: balanced spatial sample \
            with synthetic augmentation from Lotka-Volterra run" }

(** Habitat suitability classification of TESSERA tiles. *)
let act_habitat =
  { activity_id = "act-habitat-2024";
    date = "2024-07-16T09:00:00Z";
    agent = "fairground:notebook/habitat-classify:v3";
    description =
      Some "Habitat suitability classification from TESSERA v3.1 \
            land-cover embeddings, thresholded against IUCN habitat codes" }

(** Computation of the data-driven species range. *)
let act_range =
  { activity_id = "act-range-2024";
    date = "2024-07-16T11:00:00Z";
    agent = "fairground:notebook/species-range:v2";
    description =
      Some "Alpha-shape species range from all verified occurrences \
            (measured-only, no synthetic)" }

(** Computation of the final Area of Habitat. *)
let act_aoh =
  { activity_id = "act-aoh-2024";
    date = "2024-07-16T14:00:00Z";
    agent = "fairground:notebook/aoh-iucn:v3";
    description =
      Some "IUCN Area of Habitat: species range \
            intersected with suitable habitat tiles" }

(* ══════════════════════════════════════════════════════════
   2. Camera trap observations — Serengeti Lion Project

   Fixed sensors in the Serengeti NP grid. Each trigger
   produces a Point at the trap's surveyed coordinates.
   Hilbert cells b7a–b7f cover the Serengeti at level 12.
══════════════════════════════════════════════════════════ *)

(** Trap node 17: three lions, resting. *)
let trap_01 =
  make_point
    ~id:"ct-001"
    ~cell:(c "b7a")
    ~x:34.82
    ~y:(-2.33)
    ~event_date:(ed "2024-06-12T05:42:00Z")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~confidence:0.97
    ~accuracy_m:5.0
    ~observer:"urn:sensor:camera-trap:serengeti-node-17"
    ~activity:"act-field-2024"
    ~properties:
      [ ("image_uri", "s3://slp/ct17/IMG_4821.jpg");
        ("individual_count", "3");
        ("behaviour", "resting") ]
    ()

(** Trap node 17 again, two days later: a single lion, walking. *)
let trap_02 =
  make_point
    ~id:"ct-002"
    ~cell:(c "b7a")
    ~x:34.83
    ~y:(-2.32)
    ~event_date:(ed "2024-06-14T19:15:00Z")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~confidence:0.92
    ~accuracy_m:5.0
    ~observer:"urn:sensor:camera-trap:serengeti-node-17"
    ~activity:"act-field-2024"
    ~properties:
      [ ("image_uri", "s3://slp/ct17/IMG_4903.jpg");
        ("individual_count", "1");
        ("behaviour", "walking") ]
    ()

(** Trap node 42: two lions, no behaviour recorded. *)
let trap_03 =
  make_point
    ~id:"ct-003"
    ~cell:(c "b7c")
    ~x:35.01
    ~y:(-2.15)
    ~event_date:(ed "2024-06-18T03:22:00Z")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~confidence:0.88
    ~accuracy_m:5.0
    ~observer:"urn:sensor:camera-trap:serengeti-node-42"
    ~activity:"act-field-2024"
    ~properties:
      [ ("image_uri", "s3://slp/ct42/IMG_1207.jpg");
        ("individual_count", "2") ]
    ()

(** Non-detection: the trap fired on motion but no lion was present.
    Kept because occupancy models need absence data as well; note that
    no class distribution, accuracy or confidence is supplied. *)
let trap_04 =
  make_point
    ~id:"ct-004"
    ~cell:(c "b7d")
    ~x:35.22
    ~y:(-2.45)
    ~event_date:(ed "2024-06-20T22:10:00Z")
    ~observer:"urn:sensor:camera-trap:serengeti-node-55"
    ~activity:"act-field-2024"
    ~properties:
      [ ("image_uri", "s3://slp/ct55/IMG_0891.jpg");
        ("trigger", "motion");
        ("species_detected", "none") ]
    ()

(* ══════════════════════════════════════════════════════════
   3. GPS collar tracks — Movebank study 1234

   Imported via the Movebank registry.
Each fix is a Point
   with the collar as observer and the Movebank event URI as
   the registry record ([via]).
   ══════════════════════════════════════════════════════════ *)

(** Individual leo-007, first of three fixes; the track heads north-east. *)
let gps_01 =
  make_imported
    ~id:"gps-001"
    ~cell:(c "b7a")
    ~geometry:(Point { x = 34.81; y = -2.34 })
    ~event_date:(ed "2024-06-10T06:00:00Z")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~accuracy_m:3.5
    ~observer:"urn:sensor:gps:vectronic-vertex-plus-007"
    ~via:"movebank:study/1234/individual/leo-007/event/98001"
    ~license:"CC-BY-NC-4.0"
    ~activity:"act-movebank-import"
    ~properties:
      [ ("individual_id", "leo-007");
        ("fix_type", "3D");
        ("hdop", "0.9") ]
    ()

(** Individual leo-007, second fix (six hours after the first). *)
let gps_02 =
  make_imported
    ~id:"gps-002"
    ~cell:(c "b7a")
    ~geometry:(Point { x = 34.84; y = -2.31 })
    ~event_date:(ed "2024-06-10T12:00:00Z")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~accuracy_m:4.2
    ~observer:"urn:sensor:gps:vectronic-vertex-plus-007"
    ~via:"movebank:study/1234/individual/leo-007/event/98002"
    ~license:"CC-BY-NC-4.0"
    ~activity:"act-movebank-import"
    ~properties:
      [ ("individual_id", "leo-007");
        ("fix_type", "3D");
        ("hdop", "1.1") ]
    ()

(** Individual leo-007, third fix, now in the neighbouring cell b7b. *)
let gps_03 =
  make_imported
    ~id:"gps-003"
    ~cell:(c "b7b")
    ~geometry:(Point { x = 34.91; y = -2.28 })
    ~event_date:(ed "2024-06-11T06:00:00Z")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~accuracy_m:5.1
    ~observer:"urn:sensor:gps:vectronic-vertex-plus-007"
    ~via:"movebank:study/1234/individual/leo-007/event/98003"
    ~license:"CC-BY-NC-4.0"
    ~activity:"act-movebank-import"
    ~properties:
      [ ("individual_id", "leo-007");
        ("fix_type", "3D");
        ("hdop", "1.4") ]
    ()

(** Individual leo-012: separate pride, further east. *)
let gps_04 =
  make_imported
    ~id:"gps-004"
    ~cell:(c "b7c")
    ~geometry:(Point { x = 35.05; y = -2.10 })
    ~event_date:(ed "2024-06-12T06:00:00Z")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~accuracy_m:3.0
    ~observer:"urn:sensor:gps:vectronic-vertex-plus-012"
    ~via:"movebank:study/1234/individual/leo-012/event/98501"
    ~license:"CC-BY-NC-4.0"
    ~activity:"act-movebank-import"
    ~properties:[ ("individual_id", "leo-012") ]
    ()

(* ══════════════════════════════════════════════════════════
   4. GBIF occurrence records

   Museum specimens and field surveys aggregated through GBIF.
   Accuracy varies between records: the 2021 one carries a
   500 m uncertainty (flagged for review in annotations).
   ══════════════════════════════════════════════════════════ *)

(** Day-precision record with 100 m positional uncertainty. *)
let gbif_01 =
  make_imported
    ~id:"gbif-001"
    ~cell:(c "b7a")
    ~geometry:(Point { x = 34.85; y = -2.35 })
    ~event_date:(ed "2022-08-14")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~accuracy_m:100.0
    ~via:"gbif:4023589127"
    ~license:"CC-BY-4.0"
    ~activity:"act-gbif-import"
    ~properties:
      [ ("gbif_dataset", "serengeti-biodiversity-survey");
        ("basis_of_record", "HUMAN_OBSERVATION");
        ("recorded_by", "Tanzania Wildlife Research Institute") ]
    ()

(** Year-precision record with 500 m positional uncertainty. *)
let gbif_02 =
  make_imported
    ~id:"gbif-002"
    ~cell:(c "b7e")
    ~geometry:(Point { x = 35.40; y = -2.50 })
    ~event_date:(ed "2021")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~accuracy_m:500.0
    ~via:"gbif:4023589999"
    ~license:"CC-BY-4.0"
    ~activity:"act-gbif-import"
    ~properties:
      [ ("gbif_dataset", "ngorongoro-mammal-survey");
        ("basis_of_record", "HUMAN_OBSERVATION") ]
    ()

(* ══════════════════════════════════════════════════════════
   5. iNaturalist citizen science

   Research-grade observations from the iNaturalist platform.
   The observer is a user URI; the record is the observation URI.
══════════════════════════════════════════════════════════ *)

(** Research-grade observation with five identifications. *)
let inat_01 =
  make_imported
    ~id:"inat-001"
    ~cell:(c "b7b")
    ~geometry:(Point { x = 34.95; y = -2.20 })
    ~event_date:(ed "2023-07-22T16:30:00Z")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~confidence:0.95
    ~accuracy_m:50.0
    ~observer:"inaturalist:user/safari_dave"
    ~via:"inaturalist:observation/182345678"
    ~license:"CC-BY-NC-4.0"
    ~activity:"act-inat-import"
    ~properties:
      [ ("quality_grade", "research");
        ("num_identifications", "5") ]
    ()

(* ══════════════════════════════════════════════════════════
   6. IUCN Red List — expert range and habitat preferences

   Two things come out of the IUCN assessment:
     (a) An expert-drawn range polygon for the species.
     (b) Habitat preference codes (IUCN Habitats Classification
         Scheme) with suitability ratings.

   The polygon is used to validate the data-driven range; the
   habitat codes drive the suitability classification.
   ══════════════════════════════════════════════════════════ *)

(** Expert-drawn range polygon (simplified to bounding extent). *)
let iucn_range =
  make_imported
    ~id:"iucn-range-001"
    ~cell:(c "b70")
    ~geometry:
      (Polygon
         [ { x = 34.0; y = -3.0 };
           { x = 36.0; y = -3.0 };
           { x = 36.0; y = -1.0 };
           { x = 34.0; y = -1.0 };
           { x = 34.0; y = -3.0 } ])
    ~event_date:(ed "2024")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~via:"iucn:redlist:22/Panthera-leo:range:2024.1"
    ~license:"CC-BY-NC-4.0"
    ~activity:"act-iucn-import"
    ~properties:
      [ ("iucn_status", "VU");
        ("iucn_criteria", "A2abcd");
        ("population_trend", "decreasing");
        ("range_type", "extant:resident");
        ("habitat_codes", "1.5;1.6;2;3;14.1") ]
    ()

(** Habitat preference: savanna (IUCN code 2) — major habitat. *)
let iucn_hab_savanna =
  make_imported
    ~id:"iucn-hab-001"
    ~cell:(c "b70")
    ~geometry:(Point { x = 35.0; y = -2.0 })
    ~class_dist:[ ("habitat-preference:savanna", 1.0) ]
    ~confidence:0.95
    ~via:"iucn:redlist:22/Panthera-leo:habitat:2"
    ~license:"CC-BY-NC-4.0"
    ~activity:"act-iucn-import"
    ~properties:
      [ ("iucn_habitat_code", "2");
        ("suitability", "Suitable");
        ("major_importance", "Yes") ]
    ()

(** Habitat preference: shrubland (IUCN code 3) — minor habitat. *)
let iucn_hab_shrubland =
  make_imported
    ~id:"iucn-hab-002"
    ~cell:(c "b70")
    ~geometry:(Point { x = 35.0; y = -2.0 })
    ~class_dist:[ ("habitat-preference:shrubland", 1.0) ]
    ~confidence:0.70
    ~via:"iucn:redlist:22/Panthera-leo:habitat:3"
    ~license:"CC-BY-NC-4.0"
    ~activity:"act-iucn-import"
    ~properties:
      [ ("iucn_habitat_code", "3");
        ("suitability", "Suitable");
        ("major_importance", "No") ]
    ()

(* ══════════════════════════════════════════════════════════
   7. Synthetic simulation — Lotka-Volterra population dynamics

   Agent-based Lotka-Volterra model producing simulated lion
   positions in under-sampled areas (Ngorongoro corridor).
   These augment the SDM training set but are NEVER included
   in the measured species range.

   The [Simulated] origin keeps them type-level distinct from
   real observations. Properties carry the scenario parameters
   for reproducibility.
══════════════════════════════════════════════════════════ *)

(** Baseline scenario, time step 150. *)
let sim_01 =
  make_simulated
    ~id:"sim-001"
    ~cell:(c "b7d")
    ~geometry:(Point { x = 35.20; y = -2.50 })
    ~event_date:(ed "2024-06-15T00:00:00Z")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~confidence:0.60
    ~model:"fairground:notebook/lotka-volterra-serengeti:v4"
    ~run_id:"lv-run-42"
    ~activity:"act-sim-lv-001"
    ~properties:
      [ ("scenario", "baseline-2024");
        ("time_step", "150");
        ("prey_density_km2", "45.2");
        ("seed", "42") ]
    ()

(** Baseline scenario, time step 151. *)
let sim_02 =
  make_simulated
    ~id:"sim-002"
    ~cell:(c "b7d")
    ~geometry:(Point { x = 35.18; y = -2.48 })
    ~event_date:(ed "2024-06-15T06:00:00Z")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~confidence:0.60
    ~model:"fairground:notebook/lotka-volterra-serengeti:v4"
    ~run_id:"lv-run-42"
    ~activity:"act-sim-lv-001"
    ~properties:
      [ ("scenario", "baseline-2024");
        ("time_step", "151");
        ("prey_density_km2", "44.8");
        ("seed", "42") ]
    ()

(** Drought scenario — prey density drops, lion shifts south. *)
let sim_03 =
  make_simulated
    ~id:"sim-003"
    ~cell:(c "b7e")
    ~geometry:(Point { x = 35.45; y = -2.55 })
    ~event_date:(ed "2024-06-16T00:00:00Z")
    ~class_dist:[ ("Panthera leo", 1.0) ]
    ~confidence:0.55
    ~model:"fairground:notebook/lotka-volterra-serengeti:v4"
    ~run_id:"lv-run-42"
    ~activity:"act-sim-lv-001"
    ~properties:
      [ ("scenario", "drought-2024");
        ("time_step", "152");
        ("prey_density_km2", "28.1");
        ("seed", "42") ]
    ()

(* ══════════════════════════════════════════════════════════
   8. Derivation: training set assembly

   The training set is itself a derived label — it records
   exactly which labels (measured + synthetic) were selected
   for model training, and the synthetic fraction.

   This is the provenance anchor for the SDM: you can always
   ask "which observations trained this model?"
══════════════════════════════════════════════════════════ *)

(** Ids of the occurrence labels that come from real observations:
    camera trap detections, GPS fixes, GBIF and iNaturalist records.
    The camera trap non-detection ct-004 is not listed. *)
let all_measured_ids =
  [ "ct-001"; "ct-002"; "ct-003";
    "gps-001"; "gps-002"; "gps-003"; "gps-004";
    "gbif-001"; "gbif-002";
    "inat-001" ]

(** Ids of the Lotka-Volterra simulated labels. *)
let all_synthetic_ids =
  [ "sim-001"; "sim-002"; "sim-003" ]

(** The SDM training set as a derived label.

    Its [sources] are the measured ids followed by the synthetic ids.
    The [n_measured], [n_synthetic] and [synthetic_fraction] properties
    are all computed from those two lists so they cannot drift apart;
    with 10 measured and 3 synthetic ids the fraction renders as
    ["0.23"]. *)
let training_set =
  let n_measured = List.length all_measured_ids in
  let n_synthetic = List.length all_synthetic_ids in
  (* Share of synthetic labels among all training labels. *)
  let synthetic_fraction =
    float_of_int n_synthetic /. float_of_int (n_measured + n_synthetic)
  in
  make_derived
    ~id:"ts-001"
    ~cell:(c "b70")
    ~geometry:
      (Polygon
         [ { x = 34.0; y = -3.0 };
           { x = 36.0; y = -3.0 };
           { x = 36.0; y = -1.0 };
           { x = 34.0; y = -1.0 };
           { x = 34.0; y = -3.0 } ])
    ~sources:(all_measured_ids @ all_synthetic_ids)
    ~method_:"training-set:balanced-spatial-sample"
    ~class_dist:[ ("training-set:Panthera-leo:sdm-2024", 1.0) ]
    ~activity:"act-training-2024"
    ~properties:
      [ ("n_measured", string_of_int n_measured);
        ("n_synthetic", string_of_int n_synthetic);
        ("synthetic_fraction", Printf.sprintf "%.2f" synthetic_fraction);
        ("spatial_extent", "34.0,-3.0,36.0,-1.0");
        ("temporal_window", "2021/2024");
        ("tessera_model", "tessera:v3.1:east-africa") ]
    ()

(* ══════════════════════════════════════════════════════════
   9. Derivation: habitat suitability from TESSERA

   Each TESSERA tile is classified as suitable or unsuitable
   based on its land-cover embedding and the IUCN habitat
   preference codes. The [sources] link back to the IUCN
   habitat labels.
   ══════════════════════════════════════════════════════════ *)

(** Sources shared by every habitat tile: the two IUCN habitat
    preference labels.

    NOTE(review): the provenance graph in the file header places the
    training set under the habitat tiles, but "ts-001" is not listed
    here, so no simulated label is reachable from a tile through
    [sources] — confirm whether that is intended. *)
let hab_sources = [ "iucn-hab-001"; "iucn-hab-002" ]

(** Core Serengeti savanna — highly suitable. *)
let hab_01 =
  make_derived
    ~id:"hab-001"
    ~cell:(c "b7a")
    ~geometry:
      (Polygon
         [ { x = 34.80; y = -2.40 };
           { x = 34.90; y = -2.40 };
           { x = 34.90; y = -2.30 };
           { x = 34.80; y = -2.30 };
           { x = 34.80; y = -2.40 } ])
    ~sources:hab_sources
    ~method_:"habitat-classify:tessera-v3.1:threshold-0.6"
    ~class_dist:[ ("savanna", 0.78); ("shrubland", 0.13); ("other", 0.09) ]
    ~confidence:0.91
    ~activity:"act-habitat-2024"
    ~properties:
      [ ("tessera_tile", "b7a:034.80:-002.40");
        ("dominant_landcover", "savanna") ]
    ()

(** Savanna-shrubland mosaic — moderate suitability. *)
let hab_02 =
  make_derived
    ~id:"hab-002"
    ~cell:(c "b7d")
    ~geometry:
      (Polygon
         [ { x = 35.10; y = -2.60 };
           { x = 35.20; y = -2.60 };
           { x = 35.20; y = -2.50 };
           { x = 35.10; y = -2.50 };
           { x = 35.10; y = -2.60 } ])
    ~sources:hab_sources
    ~method_:"habitat-classify:tessera-v3.1:threshold-0.6"
    ~class_dist:[ ("savanna", 0.45); ("shrubland", 0.30); ("cropland", 0.25) ]
    ~confidence:0.68
    ~activity:"act-habitat-2024"
    ~properties:
      [ ("tessera_tile", "b7d:035.10:-002.60");
        ("dominant_landcover", "savanna-shrubland-mosaic") ]
    ()

(** Agricultural land — unsuitable, excluded from AOH. *)
let hab_03 =
  make_derived
    ~id:"hab-003"
    ~cell:(c "b7f")
    ~geometry:
      (Polygon
         [ { x = 35.80; y = -1.20 };
           { x = 35.90; y = -1.20 };
           { x = 35.90; y = -1.10 };
           { x = 35.80; y = -1.10 };
           { x = 35.80; y = -1.20 } ])
    ~sources:hab_sources
    ~method_:"habitat-classify:tessera-v3.1:threshold-0.6"
    ~class_dist:[ ("cropland", 0.72); ("settlement", 0.18); ("savanna", 0.10) ]
    ~confidence:0.12
    ~activity:"act-habitat-2024"
    ~properties:
      [ ("tessera_tile", "b7f:035.80:-001.20");
        ("dominant_landcover", "cropland") ]
    ()

(* ══════════════════════════════════════════════════════════
   10.
Derivation: species range from occurrences

   Alpha-shape computed from measured-only data. Synthetic
   labels are explicitly excluded — the range must reflect
   where lions have actually been observed.

   The [is_simulated] accessor is used by the range pipeline
   to filter out synthetic augmentation.
   ══════════════════════════════════════════════════════════ *)

(** Data-driven species range. Sourced from [all_measured_ids] only,
    and [n_occurrences] is the length of that same list. *)
let species_range =
  let occurrence_ids = all_measured_ids (* no sim-* labels *) in
  make_derived
    ~id:"range-001"
    ~cell:(c "b70")
    ~geometry:
      (Polygon
         [ { x = 34.75; y = -2.60 };
           { x = 35.50; y = -2.60 };
           { x = 35.50; y = -2.00 };
           { x = 35.10; y = -1.90 };
           { x = 34.75; y = -2.10 };
           { x = 34.75; y = -2.60 } ])
    ~sources:occurrence_ids
    ~method_:"alpha-shape:alpha-0.005"
    ~class_dist:[ ("range:Panthera leo", 1.0) ]
    ~activity:"act-range-2024"
    ~properties:
      [ ("range_km2", "4850");
        ("n_occurrences", string_of_int (List.length occurrence_ids));
        ("excludes_synthetic", "true") ]
    ()

(* ══════════════════════════════════════════════════════════
   11. Derivation: Area of Habitat

   The final AOH is the intersection of:
     - the data-driven species range (measured-only)
     - the TESSERA habitat suitability tiles (suitable only)
     - validated against the IUCN expert range

   The result is a Multi polygon — disconnected habitat
   patches within the range. Properties carry the IUCN
   assessment metadata and the key metrics.

   When TESSERA is retrained (v3.1 → v3.2), the habitat
   tiles change, so AOH recomputes. The new AOH label gets
   a new activity; both versions coexist for comparison.
══════════════════════════════════════════════════════════ *)

(** The Area of Habitat label: two habitat patches, sourced from the
    species range, the IUCN expert range and the two suitable tiles. *)
let aoh =
  (* Patch 1: core Serengeti savanna *)
  let core_patch =
    Polygon
      [ { x = 34.80; y = -2.40 };
        { x = 35.20; y = -2.40 };
        { x = 35.20; y = -2.10 };
        { x = 34.80; y = -2.10 };
        { x = 34.80; y = -2.40 } ]
  in
  (* Patch 2: southern extension into Ngorongoro *)
  let southern_patch =
    Polygon
      [ { x = 35.10; y = -2.60 };
        { x = 35.40; y = -2.60 };
        { x = 35.40; y = -2.40 };
        { x = 35.10; y = -2.40 };
        { x = 35.10; y = -2.60 } ]
  in
  make_derived
    ~id:"aoh-001"
    ~cell:(c "b70")
    ~geometry:(Multi [ core_patch; southern_patch ])
    ~sources:
      [ "range-001";          (* data-driven species range *)
        "iucn-range-001";     (* IUCN expert range — validation *)
        "hab-001"; "hab-002"  (* suitable habitat tiles *)
        (* hab-003 excluded: unsuitable cropland *) ]
    ~method_:"aoh:iucn-2022:range-intersect-habitat"
    ~class_dist:[ ("aoh:Panthera leo", 1.0) ]
    ~activity:"act-aoh-2024"
    ~properties:
      [ (* AOH metrics *)
        ("aoh_km2", "3420");
        ("range_km2", "4850");
        ("habitat_proportion", "0.705");
        ("unsuitable_excluded_km2", "1430");
        ("dominant_exclusion", "cropland");
        (* IUCN assessment context *)
        ("iucn_status", "VU");
        ("iucn_criteria", "A2abcd");
        ("population_trend", "decreasing");
        (* Model provenance *)
        ("tessera_model", "tessera:v3.1:east-africa");
        ("synthetic_in_sdm_training", "true");
        ("synthetic_fraction_in_training", "0.23") ]
    ()

(* ══════════════════════════════════════════════════════════
   12.
Document assembly
   ══════════════════════════════════════════════════════════ *)

(** The complete example document: CRS and Hilbert level, the activity
    log, every label of the example, free-text annotations anchored to
    label ids, and groups of labels that share an activity. *)
let doc =
  { crs = wgs84;
    level = 12;
    provenance = [
      act_field_survey;
      act_movebank_import;
      act_gbif_import;
      act_inat_import;
      act_iucn_import;
      act_simulation;
      act_training_set;
      act_habitat;
      act_range;
      act_aoh;
    ];
    labels = [
      (* Camera traps *)
      trap_01; trap_02; trap_03; trap_04;
      (* GPS collars — Movebank *)
      gps_01; gps_02; gps_03; gps_04;
      (* GBIF *)
      gbif_01; gbif_02;
      (* iNaturalist *)
      inat_01;
      (* IUCN Red List *)
      iucn_range; iucn_hab_savanna; iucn_hab_shrubland;
      (* Synthetic — Lotka-Volterra *)
      sim_01; sim_02; sim_03;
      (* Derivations *)
      training_set;
      hab_01; hab_02; hab_03;
      species_range;
      aoh;
    ];
    annotations = [
      (* NOTE(review): ct-001 is dated 2024-06-12 and gps-001 2024-06-10
         in their definitions, so "on the same day" in the text below
         does not match the data — confirm which one is intended. *)
      { id = "ann-001";
        text = "Camera trap ct-001 and GPS fix gps-001 are 1.4 km \
                apart on the same day — likely same pride. Consider \
                merge after dry-season survey completes.";
        anchors = ["ct-001"; "gps-001"] };
      { id = "ann-002";
        text = "GBIF gbif-002 has 500 m uncertainty and only year-level \
                temporal precision. Flag for review before including \
                in high-resolution analyses.";
        anchors = ["gbif-002"] };
      { id = "ann-003";
        text = "Synthetic labels sim-001..sim-003 augment the under-sampled \
                Ngorongoro corridor. Weight reduced to 0.5x in training \
                set assembly. Not included in species range computation.";
        anchors = ["sim-001"; "sim-002"; "sim-003"] };
      { id = "ann-004";
        text = "AOH shows 70.5% of range is suitable habitat. Main \
                exclusion is cropland encroachment on the eastern boundary. \
                Compare with IUCN 2019 assessment (was 78%).";
        anchors = ["aoh-001"] };
    ];
    groups = [
      { id = "grp-field-2024";
        activity = Some "act-field-2024";
        members = ["ct-001"; "ct-002"; "ct-003"; "ct-004"] };
      { id = "grp-leo-007-track";
        activity = Some "act-movebank-import";
        members = ["gps-001"; "gps-002"; "gps-003"] };
      { id = "grp-leo-012-track";
        activity = Some "act-movebank-import";
        members = ["gps-004"] };
      { id = "grp-synthetic-lv42";
        activity = Some "act-sim-lv-001";
        members = ["sim-001"; "sim-002"; "sim-003"] };
      { id = "grp-iucn-habitat-prefs";
        activity = Some "act-iucn-import";
        members = ["iucn-hab-001"; "iucn-hab-002"] };
    ];
  }

(* ══════════════════════════════════════════════════════════
   13. Queries — demonstrating the provenance graph

   These functions show how a wiki renderer or analysis
   pipeline would traverse the label graph.
   ══════════════════════════════════════════════════════════ *)

(** [find id] is the first label of [doc] whose id is [id].
    @raise Not_found if [doc] holds no such label. *)
let find id =
  List.find (fun (l : label) -> l.id = id) doc.labels

(** [in_cell cell] lists the labels of [doc] whose Hilbert cell equals
    [cell], in document order. *)
let in_cell cell =
  List.filter (fun (l : label) -> l.cell = cell) doc.labels

(** Labels of [doc] whose origin is [Measured]; every other origin is
    rejected. *)
let measured_only () =
  List.filter (fun (l : label) ->
    match l.origin with Measured _ -> true | _ -> false)
    doc.labels

(** Labels of [doc] for which [is_simulated] holds. *)
let synthetic_only () =
  List.filter is_simulated doc.labels

(** [sources_of_label l] resolves the ids returned by [sources_of l] to
    labels of [doc], keeping their order. Ids that match no label in
    [doc] are silently dropped. *)
let sources_of_label l =
  List.filter_map
    (fun src_id ->
      List.find_opt
        (fun (candidate : label) -> candidate.id = src_id)
        doc.labels)
    (sources_of l)

(** [all_ancestors l] is the transitive closure of [sources_of_label]
    starting at [l]: every label reachable through [sources], each one
    listed exactly once.

    Labels are identified by their id. A label reachable along several
    paths (e.g. a source shared by two habitat tiles) is reported at its
    first position only, so [List.length] of the result is the number of
    distinct ancestors. The traversal records every id it has seen,
    including [l]'s own, which also makes it terminate if [sources]
    ever forms a cycle; [l] itself is never part of the result.

    Order: the immediate sources of a label come first, followed by the
    ancestors of each of those sources in turn. *)
let all_ancestors (l : label) =
  let seen = Hashtbl.create 32 in
  Hashtbl.replace seen l.id ();
  let rec visit node =
    (* Sources of [node] that have not been reached before, in order. *)
    let fresh =
      List.fold_left
        (fun acc (src : label) ->
          if Hashtbl.mem seen src.id then acc
          else begin
            Hashtbl.replace seen src.id ();
            src :: acc
          end)
        [] (sources_of_label node)
      |> List.rev
    in
    fresh @ List.concat_map visit fresh
  in
  visit l

(** Number of distinct simulated labels among the ancestors of [l]. *)
let synthetic_ancestor_count l =
  all_ancestors l
  |> List.filter is_simulated
  |> List.length

(** [activity_of l] is the activity record in [doc.provenance] whose
    [activity_id] matches [l.activity], or [None] when the label has no
    activity or the id is not in the log. *)
let activity_of (l : label) =
  Option.bind l.activity (fun aid ->
    List.find_opt (fun a -> a.activity_id = aid) doc.provenance)

(* ══════════════════════════════════════════════════════════
   14. Main — exercise the provenance queries
   ══════════════════════════════════════════════════════════ *)

(* Prints a summary of the document, the AOH label's key properties,
   the size of its provenance graph, its activity, and a spatial query. *)
let () =
  let n_labels = List.length doc.labels in
  let n_measured = List.length (measured_only ()) in
  let n_synthetic = List.length (synthetic_only ()) in
  (* Computed by subtraction: any label that is neither [Measured] nor
     simulated is counted here. *)
  let n_derived = n_labels - n_measured - n_synthetic in
  Printf.printf "Terradots AOH Example: Panthera leo, Serengeti\n";
  Printf.printf "══════════════════════════════════════════════\n";
  Printf.printf "CRS: %s Hilbert level: %d\n" doc.crs doc.level;
  Printf.printf "Labels: %d total (%d measured, %d synthetic, %d derived)\n"
    n_labels n_measured n_synthetic n_derived;
  Printf.printf "Activities: %d\n" (List.length doc.provenance);
  Printf.printf "Annotations: %d\n" (List.length doc.annotations);
  Printf.printf "Groups: %d\n\n" (List.length doc.groups);

  (* AOH provenance *)
  let aoh_label = find "aoh-001" in
  Printf.printf "AOH label: %s\n" (label_name aoh_label);
  (* Property lookup on the AOH label; "?" stands in for a missing key. *)
  let props key =
    List.assoc_opt key aoh_label.properties
    |> Option.value ~default:"?"
  in
  Printf.printf " AOH: %s km² / %s km² range = %s suitable\n"
    (props "aoh_km2") (props "range_km2") (props "habitat_proportion");
  Printf.printf " IUCN status: %s (%s), trend: %s\n"
    (props "iucn_status") (props "iucn_criteria")
    (props "population_trend");
  Printf.printf " TESSERA model: %s\n" (props "tessera_model");
  Printf.printf " Synthetic in training: %s (fraction: %s)\n\n"
    (props "synthetic_in_sdm_training")
    (props "synthetic_fraction_in_training");

  (* Provenance depth *)
  let ancestors = all_ancestors aoh_label in
  let n_syn_ancestors = synthetic_ancestor_count aoh_label in
  Printf.printf "Provenance graph from AOH:\n";
  Printf.printf " Reachable labels: %d\n" (List.length ancestors);
  Printf.printf " Of which synthetic: %d\n" n_syn_ancestors;

  (* Activity for AOH *)
  (match activity_of aoh_label with
   | Some a ->
     Printf.printf " Activity: %s\n" a.activity_id;
     Printf.printf " Agent: %s\n" a.agent;
     Printf.printf " Date: %s\n" a.date
   | None -> ());

  (* Spatial query *)
  Printf.printf "\nLabels in cell b7a: %d\n"
    (List.length (in_cell (c "b7a")))