implement PreventDuplicateTransitions rule and improve API tests

+11 -4

.claude/commands/implement-orchestration-rule.md

··· 2 2 3 3 add a new rule for flow run state transitions. 4 4 5 - ## clarify first 5 + ## gather context first 6 + 7 + search these locations (use Task tool with subagent_type=Explore): 8 + 9 + 1. **python reference**: `~/github.com/prefecthq/prefect/src/prefect/server/orchestration/` - find the rule class, understand from_states/to_states, before_transition logic 10 + 2. **existing zig rules**: `src/orchestration/flow_rules.zig` - see pattern for similar rules 11 + 3. **RuleContext**: `src/orchestration/rules.zig` - check available fields/methods 6 12 7 - 1. **from_states / to_states**: which transitions trigger this rule? 8 - 2. **action**: reject, wait, abort, or modify context fields? 9 - 3. **side effects**: any database updates beyond state? 13 + clarify: 14 + - **from_states / to_states**: which transitions trigger this rule? 15 + - **action**: reject, wait, abort, or modify context fields? 16 + - **side effects**: any database updates beyond state? 10 17 11 18 ## steps 12 19

+13

.claude/commands/update-database-schema.md

··· 2 2 3 3 add or modify tables/columns via the migration system. 4 4 5 + ## gather context first 6 + 7 + search these locations (use Task tool with subagent_type=Explore): 8 + 9 + 1. **python ORM models**: `~/github.com/prefecthq/prefect/src/prefect/server/database/orm_models.py` - see column types, relationships 10 + 2. **existing zig schema**: `src/db/migrations/001_initial/` - see current table structure 11 + 3. **zig db layer**: `src/db/flow_runs.zig` (or relevant entity) - see queries that need updates 12 + 13 + clarify: 14 + - what columns/tables are needed? 15 + - what data types? (check dialect differences below) 16 + - are there new indexes or constraints? 17 + 5 18 ## steps 6 19 7 20 ### 1. create migration files

+1 -1

ROADMAP.md

··· 140 140 - [x] CopyScheduledTime - copy scheduled_time when SCHEDULED → PENDING 141 141 - [x] WaitForScheduledTime - delay transition if scheduled_time in future 142 142 - [x] RetryFailedFlows - schedule retry when RUNNING → FAILED if retries available 143 - - [ ] PreventDuplicateTransitions - idempotency via transition_id 143 + - [x] PreventDuplicateTransitions - idempotency via transition_id 144 144 - [ ] HandleFlowTerminalStateTransitions - prevent leaving completed with persisted data 145 145 - [ ] HandlePausingFlows - govern RUNNING → PAUSED 146 146 - [ ] HandleResumingPausedFlows - govern PAUSED → RUNNING

+2 -2

loq.toml

··· 10 10 11 11 [[rules]] 12 12 path = "scripts/test-api-sequence" 13 - max_lines = 1400 13 + max_lines = 1800 14 14 15 15 [[rules]] 16 16 path = "src/api/work_pools.zig" ··· 22 22 23 23 [[rules]] 24 24 path = "src/orchestration/flow_rules.zig" 25 - max_lines = 650 25 + max_lines = 750 26 26 27 27 [[rules]] 28 28 path = "src/api/flow_runs.zig"

+625 -196

scripts/test-api-sequence

··· 245 245 return True 246 246 247 247 248 - def test_orchestration_rules(client: CountingClient) -> bool: 249 - """Test orchestration rules (PreventPendingTransitions).""" 250 - if not QUIET: 251 - console.print(f"server: {BASE_URL}\n") 248 + def test_flow_with_task_runs(client: CountingClient) -> bool: 249 + """Test flow execution with nested task runs - simulates real flow execution.""" 252 250 253 - # 1. create flow 254 - if not QUIET: 255 - console.print("[bold]1. create flow[/bold]") 256 - resp = client.post("/flows/", json={"name": f"orchestration-test-{uuid.uuid4().hex[:8]}"}) 257 - if resp.status_code not in (200, 201): 258 - if not QUIET: 259 - console.print(f"[red]FAIL[/red]: create flow {resp.status_code}") 251 + def fail(msg: str) -> bool: 252 + if not QUIET: console.print(f"[red]FAIL[/red]: {msg}") 260 253 return False 261 - flow_id = resp.json()["id"] 262 - if not QUIET: 263 - console.print(f" flow_id: {flow_id}") 254 + 255 + def log(msg: str) -> None: 256 + if not QUIET: console.print(msg) 257 + 258 + # setup: create flow 259 + log("[bold]setup: create flow[/bold]") 260 + resp = client.post("/flows/", json={"name": f"flow-with-tasks-{uuid.uuid4().hex[:8]}"}) 261 + if resp.status_code not in (200, 201): return fail(f"create flow {resp.status_code}") 262 + flow_id = resp.json().get("id") 263 + log(f" flow_id: {flow_id}") 264 264 265 - # 2. create flow run in PENDING state 266 - if not QUIET: 267 - console.print("\n[bold]2. create flow run (PENDING)[/bold]") 265 + # create flow run 266 + log("[bold]1. create flow run[/bold]") 268 267 resp = client.post("/flow_runs/", json={ 269 268 "flow_id": flow_id, 270 - "name": f"orch-run-{uuid.uuid4().hex[:8]}", 269 + "name": f"run-with-tasks-{uuid.uuid4().hex[:8]}", 271 270 "state": {"type": "PENDING", "name": "Pending"}, 272 271 }) 273 - if resp.status_code not in (200, 201): 274 - if not QUIET: 275 - console.print(f"[red]FAIL[/red]: create flow run {resp.status_code}") 276 - return False 277 - flow_run_id = resp.json()["id"] 278 - if not QUIET: 279 - console.print(f" flow_run_id: {flow_run_id}") 272 + if resp.status_code not in (200, 201): return fail(f"create flow run {resp.status_code}") 273 + flow_run_id = resp.json().get("id") 274 + log(f" flow_run_id: {flow_run_id}") 280 275 281 - # 3. try PENDING → PENDING (should be REJECT) 282 - if not QUIET: 283 - console.print("\n[bold]3. PENDING → PENDING (expect REJECT)[/bold]") 276 + # start flow run 277 + log("[bold]2. start flow run (RUNNING)[/bold]") 284 278 resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 279 + "state": {"type": "RUNNING", "name": "Running"}, 280 + }) 281 + if resp.status_code not in (200, 201) or resp.json().get("status") != "ACCEPT": 282 + return fail(f"start flow run failed") 283 + log(f" flow run started") 284 + 285 + # create and execute task 1 286 + log("[bold]3. execute task 1: extract[/bold]") 287 + resp = client.post("/task_runs/", json={ 288 + "flow_run_id": flow_run_id, 289 + "task_key": "extract_data", 290 + "dynamic_key": "extract-0", 291 + "name": "extract_data-0", 285 292 "state": {"type": "PENDING", "name": "Pending"}, 286 293 }) 287 - if resp.status_code not in (200, 201): 288 - if not QUIET: 289 - console.print(f"[red]FAIL[/red]: unexpected status code {resp.status_code}") 290 - return False 291 - result = resp.json() 292 - status = result.get("status") 293 - if status != "REJECT": 294 - if not QUIET: 295 - console.print(f"[red]FAIL[/red]: expected REJECT, got {status}") 296 - return False 297 - if not QUIET: 298 - reason = result.get("details", {}).get("reason", "") 299 - console.print(f" [green]status: {status} (correct)[/green]") 300 - console.print(f" reason: {reason}") 294 + if resp.status_code not in (200, 201): return fail(f"create task 1 {resp.status_code}") 295 + task1_id = resp.json().get("id") 301 296 302 - # 4. PENDING → RUNNING (should be ACCEPT) 303 - if not QUIET: 304 - console.print("\n[bold]4. PENDING → RUNNING (expect ACCEPT)[/bold]") 305 - resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 297 + # run task 1 298 + resp = client.post(f"/task_runs/{task1_id}/set_state", json={ 306 299 "state": {"type": "RUNNING", "name": "Running"}, 307 300 }) 308 - if resp.status_code not in (200, 201): 309 - if not QUIET: 310 - console.print(f"[red]FAIL[/red]: {resp.status_code}") 311 - return False 312 - result = resp.json() 313 - status = result.get("status") 314 - if status != "ACCEPT": 315 - if not QUIET: 316 - console.print(f"[red]FAIL[/red]: expected ACCEPT, got {status}") 317 - return False 318 - if not QUIET: 319 - console.print(f" [green]status: {status} (correct)[/green]") 301 + if resp.status_code not in (200, 201): return fail(f"task 1 RUNNING {resp.status_code}") 320 302 321 - # 5. try RUNNING → PENDING (should be REJECT) 322 - if not QUIET: 323 - console.print("\n[bold]5. RUNNING → PENDING (expect REJECT)[/bold]") 324 - resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 303 + resp = client.post(f"/task_runs/{task1_id}/set_state", json={ 304 + "state": {"type": "COMPLETED", "name": "Completed"}, 305 + }) 306 + if resp.status_code not in (200, 201): return fail(f"task 1 COMPLETED {resp.status_code}") 307 + log(f" task 1 completed") 308 + 309 + # create and execute task 2 (depends on task 1) 310 + log("[bold]4. execute task 2: transform[/bold]") 311 + resp = client.post("/task_runs/", json={ 312 + "flow_run_id": flow_run_id, 313 + "task_key": "transform_data", 314 + "dynamic_key": "transform-0", 315 + "name": "transform_data-0", 325 316 "state": {"type": "PENDING", "name": "Pending"}, 326 317 }) 327 - if resp.status_code not in (200, 201): 328 - if not QUIET: 329 - console.print(f"[red]FAIL[/red]: unexpected status code {resp.status_code}") 330 - return False 331 - result = resp.json() 332 - status = result.get("status") 333 - if status != "REJECT": 334 - if not QUIET: 335 - console.print(f"[red]FAIL[/red]: expected REJECT, got {status}") 336 - return False 337 - if not QUIET: 338 - console.print(f" [green]status: {status} (correct)[/green]") 318 + if resp.status_code not in (200, 201): return fail(f"create task 2 {resp.status_code}") 319 + task2_id = resp.json().get("id") 339 320 340 - # 6. verify run is still RUNNING (reject didn't change state) 341 - if not QUIET: 342 - console.print("\n[bold]6. verify state unchanged after REJECT[/bold]") 343 - resp = client.get(f"/flow_runs/{flow_run_id}") 344 - if resp.status_code != 200: 345 - return False 346 - actual_state = resp.json().get("state_type") 347 - if actual_state != "RUNNING": 348 - if not QUIET: 349 - console.print(f"[red]FAIL[/red]: expected RUNNING, got {actual_state}") 350 - return False 351 - if not QUIET: 352 - console.print(f" [green]state: {actual_state} (correct - unchanged)[/green]") 353 - 354 - # 7. complete normally 355 - if not QUIET: 356 - console.print("\n[bold]7. RUNNING → COMPLETED (expect ACCEPT)[/bold]") 357 - resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 321 + resp = client.post(f"/task_runs/{task2_id}/set_state", json={ 322 + "state": {"type": "RUNNING", "name": "Running"}, 323 + }) 324 + resp = client.post(f"/task_runs/{task2_id}/set_state", json={ 358 325 "state": {"type": "COMPLETED", "name": "Completed"}, 359 326 }) 360 - if resp.status_code not in (200, 201): 361 - return False 362 - status = resp.json().get("status") 363 - if status != "ACCEPT": 364 - if not QUIET: 365 - console.print(f"[red]FAIL[/red]: expected ACCEPT, got {status}") 366 - return False 367 - if not QUIET: 368 - console.print(f" [green]status: {status} (correct)[/green]") 327 + if resp.status_code not in (200, 201): return fail(f"task 2 COMPLETED {resp.status_code}") 328 + log(f" task 2 completed") 369 329 370 - # ========================================================================= 371 - # CopyScheduledTime rule tests 372 - # ========================================================================= 330 + # create and execute task 3 331 + log("[bold]5. execute task 3: load[/bold]") 332 + resp = client.post("/task_runs/", json={ 333 + "flow_run_id": flow_run_id, 334 + "task_key": "load_data", 335 + "dynamic_key": "load-0", 336 + "name": "load_data-0", 337 + "state": {"type": "PENDING", "name": "Pending"}, 338 + }) 339 + if resp.status_code not in (200, 201): return fail(f"create task 3 {resp.status_code}") 340 + task3_id = resp.json().get("id") 373 341 374 - # 8. create a SCHEDULED flow run with next_scheduled_start_time 375 - scheduled_time = "2025-06-15T10:00:00Z" 376 - if not QUIET: 377 - console.print(f"\n[bold]8. create SCHEDULED run (next_scheduled_start_time={scheduled_time})[/bold]") 378 - resp = client.post("/flow_runs/", json={ 379 - "flow_id": flow_id, 380 - "name": f"scheduled-run-{uuid.uuid4().hex[:8]}", 381 - "state": {"type": "SCHEDULED", "name": "Scheduled"}, 382 - "next_scheduled_start_time": scheduled_time, 342 + resp = client.post(f"/task_runs/{task3_id}/set_state", json={ 343 + "state": {"type": "RUNNING", "name": "Running"}, 344 + }) 345 + resp = client.post(f"/task_runs/{task3_id}/set_state", json={ 346 + "state": {"type": "COMPLETED", "name": "Completed"}, 383 347 }) 384 - if resp.status_code not in (200, 201): 385 - if not QUIET: 386 - console.print(f"[red]FAIL[/red]: create scheduled run {resp.status_code}") 387 - return False 388 - scheduled_run_id = resp.json()["id"] 389 - if not QUIET: 390 - console.print(f" scheduled_run_id: {scheduled_run_id}") 348 + if resp.status_code not in (200, 201): return fail(f"task 3 COMPLETED {resp.status_code}") 349 + log(f" task 3 completed") 391 350 392 - # 9. transition SCHEDULED → PENDING (CopyScheduledTime should copy scheduled_time) 393 - if not QUIET: 394 - console.print("\n[bold]9. SCHEDULED → PENDING (expect scheduled_time copied)[/bold]") 395 - resp = client.post(f"/flow_runs/{scheduled_run_id}/set_state", json={ 396 - "state": {"type": "PENDING", "name": "Pending"}, 351 + # complete flow run 352 + log("[bold]6. complete flow run[/bold]") 353 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 354 + "state": {"type": "COMPLETED", "name": "Completed"}, 397 355 }) 398 - if resp.status_code not in (200, 201): 399 - if not QUIET: 400 - console.print(f"[red]FAIL[/red]: set_state {resp.status_code}") 401 - return False 402 - status = resp.json().get("status") 403 - if status != "ACCEPT": 404 - if not QUIET: 405 - console.print(f"[red]FAIL[/red]: expected ACCEPT, got {status}") 406 - return False 407 - if not QUIET: 408 - console.print(f" [green]status: {status} (correct)[/green]") 356 + if resp.status_code not in (200, 201) or resp.json().get("status") != "ACCEPT": 357 + return fail(f"complete flow run failed") 358 + log(f" [green]flow run completed[/green]") 409 359 410 - # 10. verify expected_start_time was set from next_scheduled_start_time 411 - if not QUIET: 412 - console.print("\n[bold]10. verify expected_start_time copied[/bold]") 413 - resp = client.get(f"/flow_runs/{scheduled_run_id}") 414 - if resp.status_code != 200: 415 - return False 416 - run_data = resp.json() 417 - expected_start = run_data.get("expected_start_time") 418 - if expected_start != scheduled_time: 419 - if not QUIET: 420 - console.print(f"[red]FAIL[/red]: expected_start_time={expected_start}, expected {scheduled_time}") 421 - return False 422 - if not QUIET: 423 - console.print(f" [green]expected_start_time: {expected_start} (correct)[/green]") 360 + # verify task runs can be read individually and are completed 361 + log("[bold]7. verify task runs completed[/bold]") 362 + for task_id, task_name in [(task1_id, "extract"), (task2_id, "transform"), (task3_id, "load")]: 363 + resp = client.get(f"/task_runs/{task_id}") 364 + if resp.status_code != 200: return fail(f"read task {task_name} {resp.status_code}") 365 + task_run = resp.json() 366 + if task_run.get("flow_run_id") != flow_run_id: 367 + return fail(f"task {task_name} not linked to flow run") 368 + if task_run.get("state_type") != "COMPLETED": 369 + return fail(f"task {task_name} not COMPLETED") 370 + log(f" [green]all 3 tasks completed and linked to flow run[/green]") 424 371 425 372 return True 426 373 ··· 1035 982 def log(msg: str) -> None: 1036 983 if not QUIET: console.print(msg) 1037 984 985 + def wait_for_runs(deployment_id: str, min_count: int, max_wait: float = 10.0) -> int: 986 + """Poll until we have at least min_count runs, or timeout (client-side filter).""" 987 + start = time_mod.time() 988 + while time_mod.time() - start < max_wait: 989 + resp = client.post("/flow_runs/filter", json={"limit": 100}) 990 + if resp.status_code == 200: 991 + all_runs = resp.json() 992 + matching = [r for r in all_runs if r.get("deployment_id") == deployment_id] 993 + if len(matching) >= min_count: 994 + return len(matching) 995 + time_mod.sleep(0.5) 996 + return 0 997 + 1038 998 # setup: create flow, work pool, deployment with interval schedule 1039 999 log("[bold]setup[/bold]") 1040 1000 resp = client.post("/flows/", json={"name": f"idem-flow-{uuid.uuid4().hex[:8]}"}) ··· 1058 1018 deployment_id = deployment.get("id") 1059 1019 log(f" deployment: {deployment_id}") 1060 1020 1061 - # wait for scheduler to run once (default 5s interval) 1062 - log("[bold]waiting for scheduler (7s)...[/bold]") 1063 - time_mod.sleep(7) 1064 - 1065 - # count runs after first scheduler tick 1066 - resp = client.post("/flow_runs/filter", json={ 1067 - "flow_runs": {"deployment_id": {"any_": [deployment_id]}}, 1068 - "limit": 100, 1069 - }) 1070 - if resp.status_code != 200: return fail(f"filter flow_runs {resp.status_code}") 1071 - runs_after_first = resp.json() 1072 - count_after_first = len(runs_after_first) 1021 + # poll for scheduler to create first run (instead of fixed sleep) 1022 + log("[bold]waiting for scheduler to create runs...[/bold]") 1023 + count_after_first = wait_for_runs(deployment_id, min_count=1) 1073 1024 log(f" runs after first tick: {count_after_first}") 1074 1025 1075 1026 if count_after_first == 0: 1076 - return fail("scheduler did not create any runs") 1077 - 1078 - # wait for scheduler to run again 1079 - log("[bold]waiting for second scheduler tick (7s)...[/bold]") 1080 - time_mod.sleep(7) 1027 + return fail("scheduler did not create any runs (timeout)") 1081 1028 1082 - # count runs after second scheduler tick 1083 - resp = client.post("/flow_runs/filter", json={ 1084 - "flow_runs": {"deployment_id": {"any_": [deployment_id]}}, 1085 - "limit": 100, 1086 - }) 1087 - if resp.status_code != 200: return fail(f"filter flow_runs {resp.status_code}") 1088 - runs_after_second = resp.json() 1089 - count_after_second = len(runs_after_second) 1090 - log(f" runs after second tick: {count_after_second}") 1029 + # verify idempotency: poll for a few seconds to confirm count stays stable 1030 + # if scheduler creates duplicates, count would increase during this window 1031 + log("[bold]verifying idempotency (polling to confirm count stable)...[/bold]") 1032 + stable_checks = 0 1033 + for _ in range(6): # 6 checks over ~3 seconds 1034 + time_mod.sleep(0.5) 1035 + resp = client.post("/flow_runs/filter", json={"limit": 100}) 1036 + if resp.status_code != 200: continue 1037 + all_runs = resp.json() 1038 + matching = [r for r in all_runs if r.get("deployment_id") == deployment_id] 1039 + current_count = len(matching) 1040 + if current_count == count_after_first: 1041 + stable_checks += 1 1042 + elif current_count > count_after_first: 1043 + return fail(f"idempotency failed: {count_after_first} -> {current_count} runs") 1091 1044 1092 - # key test: same number of runs means idempotency works 1093 - # (scheduler shouldn't create duplicates for same scheduled times) 1094 - if count_after_second != count_after_first: 1095 - return fail(f"idempotency failed: {count_after_first} -> {count_after_second} runs") 1096 - log(f" [green]idempotency verified: count unchanged[/green]") 1045 + log(f" stable count checks: {stable_checks}/6") 1046 + log(f" [green]idempotency verified: count unchanged at {count_after_first}[/green]") 1097 1047 1098 1048 # cleanup 1099 1049 client.delete(f"/deployments/{deployment_id}") ··· 1144 1094 log(f" deployment: {deployment_id}") 1145 1095 log(f" deployment params: {deployment.get('parameters')}") 1146 1096 1147 - # wait for scheduler to create runs 1148 - log("[bold]waiting for scheduler (7s)...[/bold]") 1149 - time_mod.sleep(7) 1150 - 1151 - # get the scheduled runs and check their parameters 1152 - resp = client.post("/flow_runs/filter", json={ 1153 - "flow_runs": {"deployment_id": {"any_": [deployment_id]}}, 1154 - "limit": 10, 1155 - }) 1156 - if resp.status_code != 200: return fail(f"filter flow_runs {resp.status_code}") 1157 - runs = resp.json() 1158 - log(f" found {len(runs)} runs") 1097 + # poll for scheduler to create runs (filter client-side since server filter doesn't support deployment_id yet) 1098 + log("[bold]polling for scheduler to create runs...[/bold]") 1099 + runs = [] 1100 + start = time_mod.time() 1101 + max_wait = 10.0 1102 + while time_mod.time() - start < max_wait: 1103 + resp = client.post("/flow_runs/filter", json={"limit": 100}) 1104 + if resp.status_code == 200: 1105 + all_runs = resp.json() 1106 + runs = [r for r in all_runs if r.get("deployment_id") == deployment_id] 1107 + if len(runs) > 0: 1108 + break 1109 + time_mod.sleep(0.5) 1110 + log(f" found {len(runs)} runs for deployment in {time_mod.time() - start:.1f}s") 1159 1111 1160 1112 if len(runs) == 0: 1161 - return fail("scheduler did not create any runs") 1113 + return fail("scheduler did not create any runs (timeout)") 1162 1114 1163 1115 # check merged parameters on first run 1164 1116 run_params = runs[0].get("parameters", {}) ··· 1272 1224 return True 1273 1225 1274 1226 1227 + def test_retry_failed_flows(client: CountingClient) -> bool: 1228 + """Test RetryFailedFlows rule - retry cycle when configured.""" 1229 + 1230 + def fail(msg: str) -> bool: 1231 + if not QUIET: console.print(f"[red]FAIL[/red]: {msg}") 1232 + return False 1233 + 1234 + def log(msg: str) -> None: 1235 + if not QUIET: console.print(msg) 1236 + 1237 + # setup 1238 + log("[bold]setup[/bold]") 1239 + resp = client.post("/flows/", json={"name": f"retry-flow-{uuid.uuid4().hex[:8]}"}) 1240 + if resp.status_code not in (200, 201): return fail(f"create flow {resp.status_code}") 1241 + flow_id = resp.json().get("id") 1242 + 1243 + # create flow run with retry configuration via empirical_policy 1244 + log("[bold]create flow run with retries=2, retry_delay=1[/bold]") 1245 + resp = client.post("/flow_runs/", json={ 1246 + "flow_id": flow_id, 1247 + "name": f"retry-run-{uuid.uuid4().hex[:8]}", 1248 + "state": {"type": "PENDING", "name": "Pending"}, 1249 + "empirical_policy": {"retries": 2, "retry_delay": 1}, 1250 + }) 1251 + if resp.status_code not in (200, 201): return fail(f"create flow run {resp.status_code}") 1252 + flow_run = resp.json() 1253 + flow_run_id = flow_run.get("id") 1254 + log(f" flow_run_id: {flow_run_id}") 1255 + 1256 + # run_count starts at 0, increments when entering RUNNING 1257 + # transition to RUNNING (run_count becomes 1) 1258 + log("[bold]PENDING → RUNNING (first attempt)[/bold]") 1259 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1260 + "state": {"type": "RUNNING", "name": "Running"}, 1261 + }) 1262 + if resp.status_code not in (200, 201): return fail(f"set RUNNING {resp.status_code}") 1263 + if resp.json().get("status") != "ACCEPT": 1264 + return fail(f"expected ACCEPT for RUNNING, got {resp.json().get('status')}") 1265 + 1266 + # verify run_count 1267 + resp = client.get(f"/flow_runs/{flow_run_id}") 1268 + if resp.status_code != 200: return fail(f"get flow run {resp.status_code}") 1269 + run_count = resp.json().get("run_count", 0) 1270 + log(f" run_count after RUNNING: {run_count}") 1271 + 1272 + # fail - with retries available, should REJECT and schedule retry 1273 + log("[bold]RUNNING → FAILED (expect REJECT + AwaitingRetry)[/bold]") 1274 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1275 + "state": {"type": "FAILED", "name": "Failed", "message": "Test failure"}, 1276 + }) 1277 + if resp.status_code not in (200, 201): return fail(f"set FAILED {resp.status_code}") 1278 + result = resp.json() 1279 + status = result.get("status") 1280 + log(f" status: {status}") 1281 + 1282 + # with retries, should get REJECT and the run should be in AwaitingRetry state 1283 + if status == "REJECT": 1284 + log(f" [green]REJECT (retries available)[/green]") 1285 + # verify state is now AwaitingRetry (SCHEDULED) 1286 + resp = client.get(f"/flow_runs/{flow_run_id}") 1287 + if resp.status_code != 200: return fail(f"get flow run {resp.status_code}") 1288 + actual_state = resp.json().get("state_type") 1289 + actual_name = resp.json().get("state_name") 1290 + log(f" state: {actual_state}/{actual_name}") 1291 + if actual_state != "SCHEDULED": 1292 + return fail(f"expected SCHEDULED (AwaitingRetry), got {actual_state}") 1293 + log(f" [green]state: SCHEDULED/AwaitingRetry (correct)[/green]") 1294 + elif status == "ACCEPT": 1295 + # this could happen if run_count > retries - check if that's the case 1296 + log(f" ACCEPT (retries may be exhausted)") 1297 + return True 1298 + else: 1299 + return fail(f"unexpected status: {status}") 1300 + 1301 + # complete the retry cycle: SCHEDULED → PENDING → RUNNING → COMPLETED 1302 + log("[bold]retry cycle: SCHEDULED → PENDING → RUNNING → COMPLETED[/bold]") 1303 + 1304 + # SCHEDULED → PENDING 1305 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1306 + "state": {"type": "PENDING", "name": "Pending"}, 1307 + }) 1308 + if resp.status_code not in (200, 201): return fail(f"retry PENDING {resp.status_code}") 1309 + log(f" → PENDING: {resp.json().get('status')}") 1310 + 1311 + # PENDING → RUNNING (run_count increments) 1312 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1313 + "state": {"type": "RUNNING", "name": "Running"}, 1314 + }) 1315 + if resp.status_code not in (200, 201): return fail(f"retry RUNNING {resp.status_code}") 1316 + log(f" → RUNNING: {resp.json().get('status')}") 1317 + 1318 + # verify run_count incremented 1319 + resp = client.get(f"/flow_runs/{flow_run_id}") 1320 + run_count = resp.json().get("run_count", 0) 1321 + log(f" run_count: {run_count}") 1322 + 1323 + # complete successfully 1324 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1325 + "state": {"type": "COMPLETED", "name": "Completed"}, 1326 + }) 1327 + if resp.status_code not in (200, 201): return fail(f"COMPLETED {resp.status_code}") 1328 + if resp.json().get("status") != "ACCEPT": 1329 + return fail(f"expected ACCEPT for COMPLETED, got {resp.json().get('status')}") 1330 + log(f" [green]→ COMPLETED: ACCEPT[/green]") 1331 + 1332 + return True 1333 + 1334 + 1335 + def test_cancellation_flow(client: CountingClient) -> bool: 1336 + """Test cancellation states (CANCELLING, CANCELLED).""" 1337 + 1338 + def fail(msg: str) -> bool: 1339 + if not QUIET: console.print(f"[red]FAIL[/red]: {msg}") 1340 + return False 1341 + 1342 + def log(msg: str) -> None: 1343 + if not QUIET: console.print(msg) 1344 + 1345 + # setup 1346 + log("[bold]setup[/bold]") 1347 + resp = client.post("/flows/", json={"name": f"cancel-flow-{uuid.uuid4().hex[:8]}"}) 1348 + if resp.status_code not in (200, 201): return fail(f"create flow {resp.status_code}") 1349 + flow_id = resp.json().get("id") 1350 + 1351 + resp = client.post("/flow_runs/", json={ 1352 + "flow_id": flow_id, 1353 + "name": f"cancel-run-{uuid.uuid4().hex[:8]}", 1354 + "state": {"type": "PENDING", "name": "Pending"}, 1355 + }) 1356 + if resp.status_code not in (200, 201): return fail(f"create flow run {resp.status_code}") 1357 + flow_run_id = resp.json().get("id") 1358 + log(f" flow_run_id: {flow_run_id}") 1359 + 1360 + # start the run 1361 + log("[bold]PENDING → RUNNING[/bold]") 1362 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1363 + "state": {"type": "RUNNING", "name": "Running"}, 1364 + }) 1365 + if resp.status_code not in (200, 201) or resp.json().get("status") != "ACCEPT": 1366 + return fail(f"RUNNING transition failed") 1367 + log(f" status: ACCEPT") 1368 + 1369 + # cancel while running - first CANCELLING 1370 + log("[bold]RUNNING → CANCELLING[/bold]") 1371 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1372 + "state": {"type": "CANCELLING", "name": "Cancelling"}, 1373 + }) 1374 + if resp.status_code not in (200, 201): return fail(f"CANCELLING {resp.status_code}") 1375 + if resp.json().get("status") != "ACCEPT": 1376 + return fail(f"expected ACCEPT for CANCELLING, got {resp.json().get('status')}") 1377 + log(f" [green]status: ACCEPT[/green]") 1378 + 1379 + # verify state 1380 + resp = client.get(f"/flow_runs/{flow_run_id}") 1381 + if resp.json().get("state_type") != "CANCELLING": 1382 + return fail(f"expected CANCELLING, got {resp.json().get('state_type')}") 1383 + 1384 + # complete cancellation 1385 + log("[bold]CANCELLING → CANCELLED[/bold]") 1386 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1387 + "state": {"type": "CANCELLED", "name": "Cancelled"}, 1388 + }) 1389 + if resp.status_code not in (200, 201): return fail(f"CANCELLED {resp.status_code}") 1390 + if resp.json().get("status") != "ACCEPT": 1391 + return fail(f"expected ACCEPT for CANCELLED, got {resp.json().get('status')}") 1392 + log(f" [green]status: ACCEPT[/green]") 1393 + 1394 + # verify final state 1395 + resp = client.get(f"/flow_runs/{flow_run_id}") 1396 + if resp.json().get("state_type") != "CANCELLED": 1397 + return fail(f"expected CANCELLED, got {resp.json().get('state_type')}") 1398 + log(f" [green]final state: CANCELLED[/green]") 1399 + 1400 + # CANCELLED is terminal - verify can't go back to PENDING (PreventPendingTransitions) 1401 + log("[bold]CANCELLED → PENDING (expect REJECT)[/bold]") 1402 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1403 + "state": {"type": "PENDING", "name": "Pending"}, 1404 + }) 1405 + if resp.status_code not in (200, 201): return fail(f"PENDING attempt {resp.status_code}") 1406 + if resp.json().get("status") != "REJECT": 1407 + return fail(f"expected REJECT for CANCELLED→PENDING, got {resp.json().get('status')}") 1408 + log(f" [green]CANCELLED → PENDING: REJECT (correct)[/green]") 1409 + 1410 + return True 1411 + 1412 + 1413 + def test_serve_pattern(client: CountingClient) -> bool: 1414 + """Test Runner/.serve() pattern: poll /deployments/get_scheduled_flow_runs, execute locally. 1415 + 1416 + This simulates what happens when you call flow.serve() - a Runner process polls 1417 + the deployments endpoint for scheduled runs and executes them in subprocesses. 1418 + """ 1419 + 1420 + def fail(msg: str) -> bool: 1421 + if not QUIET: console.print(f"[red]FAIL[/red]: {msg}") 1422 + return False 1423 + 1424 + def log(msg: str) -> None: 1425 + if not QUIET: console.print(msg) 1426 + 1427 + # setup: create flow and deployment (no work pool - serve doesn't use workers) 1428 + log("[bold]setup: flow + deployment (no work pool)[/bold]") 1429 + resp = client.post("/flows/", json={"name": f"serve-flow-{uuid.uuid4().hex[:8]}"}) 1430 + if resp.status_code not in (200, 201): return fail(f"create flow {resp.status_code}") 1431 + flow_id = resp.json().get("id") 1432 + 1433 + resp = client.post("/deployments/", json={ 1434 + "name": f"serve-deploy-{uuid.uuid4().hex[:8]}", 1435 + "flow_id": flow_id, 1436 + # no work_pool_name - this is the serve pattern 1437 + }) 1438 + if resp.status_code not in (200, 201): return fail(f"create deployment {resp.status_code}") 1439 + deployment = resp.json() 1440 + deployment_id = deployment.get("id") 1441 + log(f" deployment: {deployment_id}") 1442 + 1443 + # create a scheduled flow run 1444 + log("[bold]1. create scheduled flow run[/bold]") 1445 + resp = client.post(f"/deployments/{deployment_id}/create_flow_run", json={ 1446 + "state": {"type": "SCHEDULED", "name": "Scheduled"}, 1447 + }) 1448 + if resp.status_code not in (200, 201): return fail(f"create flow run {resp.status_code}") 1449 + flow_run = resp.json() 1450 + flow_run_id = flow_run.get("id") 1451 + log(f" flow_run: {flow_run_id}") 1452 + 1453 + # Runner polls: POST /deployments/get_scheduled_flow_runs 1454 + log("[bold]2. Runner polls /deployments/get_scheduled_flow_runs[/bold]") 1455 + from datetime import datetime, timezone 1456 + resp = client.post("/deployments/get_scheduled_flow_runs", json={ 1457 + "deployment_ids": [deployment_id], 1458 + "scheduled_before": datetime.now(timezone.utc).isoformat(), 1459 + "limit": 10, 1460 + }) 1461 + if resp.status_code != 200: return fail(f"poll {resp.status_code}") 1462 + scheduled = resp.json() 1463 + found = any(r.get("id") == flow_run_id for r in scheduled) 1464 + if not found: return fail("run not found in scheduled runs") 1465 + log(f" found {len(scheduled)} scheduled run(s)") 1466 + 1467 + # Runner executes: SCHEDULED → PENDING → RUNNING → COMPLETED 1468 + log("[bold]3. Runner executes flow locally[/bold]") 1469 + 1470 + # claim the run 1471 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1472 + "state": {"type": "PENDING", "name": "Pending"}, 1473 + }) 1474 + if resp.status_code not in (200, 201): return fail(f"PENDING {resp.status_code}") 1475 + log(f" → PENDING: {resp.json().get('status')}") 1476 + 1477 + # execute 1478 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1479 + "state": {"type": "RUNNING", "name": "Running"}, 1480 + }) 1481 + if resp.status_code not in (200, 201): return fail(f"RUNNING {resp.status_code}") 1482 + log(f" → RUNNING: {resp.json().get('status')}") 1483 + 1484 + # complete 1485 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1486 + "state": {"type": "COMPLETED", "name": "Completed"}, 1487 + }) 1488 + if resp.status_code not in (200, 201): return fail(f"COMPLETED {resp.status_code}") 1489 + if resp.json().get("status") != "ACCEPT": 1490 + return fail(f"expected ACCEPT, got {resp.json().get('status')}") 1491 + log(f" [green]→ COMPLETED: ACCEPT[/green]") 1492 + 1493 + # cleanup 1494 + client.delete(f"/deployments/{deployment_id}") 1495 + log(" cleanup: ok") 1496 + 1497 + return True 1498 + 1499 + 1500 + def test_worker_pattern(client: CountingClient) -> bool: 1501 + """Test Worker pattern: poll /work_pools/{name}/get_scheduled_flow_runs, dispatch to infra. 1502 + 1503 + This simulates what a Prefect Worker does - it polls a work pool for scheduled runs 1504 + and dispatches them to infrastructure (k8s, docker, etc). The worker doesn't execute 1505 + the flow itself; infrastructure does. 1506 + """ 1507 + 1508 + def fail(msg: str) -> bool: 1509 + if not QUIET: console.print(f"[red]FAIL[/red]: {msg}") 1510 + return False 1511 + 1512 + def log(msg: str) -> None: 1513 + if not QUIET: console.print(msg) 1514 + 1515 + # setup: flow + work pool + deployment with work_pool_name 1516 + log("[bold]setup: flow + work pool + deployment[/bold]") 1517 + resp = client.post("/flows/", json={"name": f"worker-flow-{uuid.uuid4().hex[:8]}"}) 1518 + if resp.status_code not in (200, 201): return fail(f"create flow {resp.status_code}") 1519 + flow_id = resp.json().get("id") 1520 + 1521 + pool_name = f"worker-pool-{uuid.uuid4().hex[:8]}" 1522 + resp = client.post("/work_pools/", json={"name": pool_name, "type": "kubernetes"}) 1523 + if resp.status_code not in (200, 201): return fail(f"create work_pool {resp.status_code}") 1524 + log(f" pool: {pool_name}") 1525 + 1526 + resp = client.post("/deployments/", json={ 1527 + "name": f"worker-deploy-{uuid.uuid4().hex[:8]}", 1528 + "flow_id": flow_id, 1529 + "work_pool_name": pool_name, # KEY: uses work pool 1530 + }) 1531 + if resp.status_code not in (200, 201): return fail(f"create deployment {resp.status_code}") 1532 + deployment_id = resp.json().get("id") 1533 + log(f" deployment: {deployment_id}") 1534 + 1535 + # worker registers via heartbeat 1536 + log("[bold]1. Worker sends heartbeat[/bold]") 1537 + resp = client.post(f"/work_pools/{pool_name}/workers/heartbeat", json={ 1538 + "name": "k8s-worker-1", 1539 + "heartbeat_interval_seconds": 30, 1540 + }) 1541 + if resp.status_code != 204: return fail(f"heartbeat {resp.status_code}") 1542 + log(f" heartbeat sent") 1543 + 1544 + # create a scheduled flow run 1545 + log("[bold]2. create scheduled flow run[/bold]") 1546 + resp = client.post(f"/deployments/{deployment_id}/create_flow_run", json={ 1547 + "state": {"type": "SCHEDULED", "name": "Scheduled"}, 1548 + }) 1549 + if resp.status_code not in (200, 201): return fail(f"create flow run {resp.status_code}") 1550 + flow_run_id = resp.json().get("id") 1551 + log(f" flow_run: {flow_run_id}") 1552 + 1553 + # Worker polls: POST /work_pools/{name}/get_scheduled_flow_runs 1554 + log("[bold]3. Worker polls /work_pools/{name}/get_scheduled_flow_runs[/bold]") 1555 + resp = client.post(f"/work_pools/{pool_name}/get_scheduled_flow_runs", json={}) 1556 + if resp.status_code != 200: return fail(f"poll {resp.status_code}") 1557 + scheduled = resp.json() 1558 + found = any(r.get("flow_run", {}).get("id") == flow_run_id for r in scheduled) 1559 + if not found: return fail("run not found in work pool queue") 1560 + log(f" found run in work pool queue") 1561 + 1562 + # Worker dispatches to infrastructure (simulated - infra reports back state) 1563 + log("[bold]4. Infrastructure executes and reports state[/bold]") 1564 + 1565 + # infra claims 1566 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1567 + "state": {"type": "PENDING", "name": "Pending"}, 1568 + }) 1569 + log(f" → PENDING: {resp.json().get('status')}") 1570 + 1571 + # infra runs 1572 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1573 + "state": {"type": "RUNNING", "name": "Running"}, 1574 + }) 1575 + log(f" → RUNNING: {resp.json().get('status')}") 1576 + 1577 + # infra completes 1578 + resp = client.post(f"/flow_runs/{flow_run_id}/set_state", json={ 1579 + "state": {"type": "COMPLETED", "name": "Completed"}, 1580 + }) 1581 + if resp.json().get("status") != "ACCEPT": 1582 + return fail(f"expected ACCEPT, got {resp.json().get('status')}") 1583 + log(f" [green]→ COMPLETED: ACCEPT[/green]") 1584 + 1585 + # cleanup 1586 + client.delete(f"/deployments/{deployment_id}") 1587 + client.delete(f"/work_pools/{pool_name}") 1588 + log(" cleanup: ok") 1589 + 1590 + return True 1591 + 1592 + 1593 + def test_work_queue_priority(client: CountingClient) -> bool: 1594 + """Test work queue priority ordering in get_scheduled_flow_runs.""" 1595 + 1596 + def fail(msg: str) -> bool: 1597 + if not QUIET: console.print(f"[red]FAIL[/red]: {msg}") 1598 + return False 1599 + 1600 + def log(msg: str) -> None: 1601 + if not QUIET: console.print(msg) 1602 + 1603 + # setup 1604 + log("[bold]setup: create pool with multiple queues[/bold]") 1605 + resp = client.post("/flows/", json={"name": f"priority-flow-{uuid.uuid4().hex[:8]}"}) 1606 + if resp.status_code not in (200, 201): return fail(f"create flow {resp.status_code}") 1607 + flow_id = resp.json().get("id") 1608 + 1609 + pool_name = f"priority-pool-{uuid.uuid4().hex[:8]}" 1610 + resp = client.post("/work_pools/", json={"name": pool_name, "type": "process"}) 1611 + if resp.status_code not in (200, 201): return fail(f"create pool {resp.status_code}") 1612 + default_queue_id = resp.json().get("default_queue_id") 1613 + log(f" pool: {pool_name} (default queue priority=1)") 1614 + 1615 + # create high-priority queue (lower number = higher priority) 1616 + resp = client.post(f"/work_pools/{pool_name}/queues/", json={ 1617 + "name": "high-priority", 1618 + "priority": 1, # highest priority 1619 + }) 1620 + if resp.status_code not in (200, 201): return fail(f"create high-priority queue {resp.status_code}") 1621 + high_queue_id = resp.json().get("id") 1622 + log(f" high-priority queue: {high_queue_id}") 1623 + 1624 + # create low-priority queue 1625 + resp = client.post(f"/work_pools/{pool_name}/queues/", json={ 1626 + "name": "low-priority", 1627 + "priority": 100, # lowest priority 1628 + }) 1629 + if resp.status_code not in (200, 201): return fail(f"create low-priority queue {resp.status_code}") 1630 + low_queue_id = resp.json().get("id") 1631 + log(f" low-priority queue: {low_queue_id}") 1632 + 1633 + # create deployments for each queue 1634 + deployments = {} 1635 + for queue_name in ["high-priority", "low-priority", "default"]: 1636 + resp = client.post("/deployments/", json={ 1637 + "name": f"deploy-{queue_name}-{uuid.uuid4().hex[:8]}", 1638 + "flow_id": flow_id, 1639 + "work_pool_name": pool_name, 1640 + "work_queue_name": queue_name, 1641 + }) 1642 + if resp.status_code not in (200, 201): return fail(f"create deployment {queue_name} {resp.status_code}") 1643 + deployments[queue_name] = resp.json().get("id") 1644 + log(f" deployment ({queue_name}): {deployments[queue_name]}") 1645 + 1646 + # create flow runs in each queue (order: low, default, high) 1647 + flow_runs = {} 1648 + log("[bold]create flow runs in reverse priority order[/bold]") 1649 + for queue_name in ["low-priority", "default", "high-priority"]: 1650 + resp = client.post(f"/deployments/{deployments[queue_name]}/create_flow_run", json={ 1651 + "state": {"type": "SCHEDULED", "name": "Scheduled"}, 1652 + }) 1653 + if resp.status_code not in (200, 201): return fail(f"create run for {queue_name} {resp.status_code}") 1654 + flow_runs[queue_name] = resp.json().get("id") 1655 + log(f" run ({queue_name}): {flow_runs[queue_name]}") 1656 + 1657 + # poll for scheduled runs 1658 + log("[bold]get_scheduled_flow_runs (expect high-priority first)[/bold]") 1659 + resp = client.post(f"/work_pools/{pool_name}/get_scheduled_flow_runs", json={}) 1660 + if resp.status_code != 200: return fail(f"poll {resp.status_code}") 1661 + scheduled = resp.json() 1662 + log(f" returned {len(scheduled)} runs") 1663 + 1664 + if len(scheduled) < 3: 1665 + return fail(f"expected at least 3 runs, got {len(scheduled)}") 1666 + 1667 + # verify order: high-priority first, then default, then low-priority 1668 + # the order in the response should match priority 1669 + run_ids = [r.get("flow_run", {}).get("id") for r in scheduled] 1670 + 1671 + # find positions 1672 + high_pos = run_ids.index(flow_runs["high-priority"]) if flow_runs["high-priority"] in run_ids else -1 1673 + default_pos = run_ids.index(flow_runs["default"]) if flow_runs["default"] in run_ids else -1 1674 + low_pos = run_ids.index(flow_runs["low-priority"]) if flow_runs["low-priority"] in run_ids else -1 1675 + 1676 + log(f" positions: high={high_pos}, default={default_pos}, low={low_pos}") 1677 + 1678 + if high_pos == -1 or default_pos == -1 or low_pos == -1: 1679 + return fail("not all runs found in results") 1680 + 1681 + # high-priority should come before low-priority 1682 + if high_pos > low_pos: 1683 + return fail(f"high-priority run should come before low-priority (positions: {high_pos} vs {low_pos})") 1684 + log(f" [green]priority ordering verified[/green]") 1685 + 1686 + # cleanup 1687 + for deploy_id in deployments.values(): 1688 + client.delete(f"/deployments/{deploy_id}") 1689 + client.delete(f"/work_pools/{pool_name}") 1690 + log(" cleanup: ok") 1691 + 1692 + return True 1693 + 1694 + 1275 1695 def main(): 1276 1696 json_output = "--json" in sys.argv 1277 1697 ··· 1281 1701 results: list[TestResult] = [] 1282 1702 1283 1703 # run all tests 1704 + # core API tests (fast, no sleeps) 1284 1705 results.append(run_test("admin", test_admin)) 1285 1706 results.append(run_test("flow_run (success)", lambda c: test_flow_run(c, should_fail=False))) 1286 1707 results.append(run_test("flow_run (failure)", lambda c: test_flow_run(c, should_fail=True))) 1287 - results.append(run_test("orchestration_rules", test_orchestration_rules)) 1708 + results.append(run_test("flow_with_task_runs", test_flow_with_task_runs)) 1288 1709 results.append(run_test("task_run", test_task_run)) 1289 1710 results.append(run_test("filters", test_filters)) 1290 1711 results.append(run_test("logs", test_logs)) ··· 1292 1713 results.append(run_test("blocks", test_blocks)) 1293 1714 results.append(run_test("work_pools", test_work_pools)) 1294 1715 results.append(run_test("deployments", test_deployments)) 1295 - results.append(run_test("get_scheduled_flow_runs", test_get_scheduled_flow_runs)) 1716 + 1717 + # execution patterns (two distinct models) 1718 + results.append(run_test("serve_pattern", test_serve_pattern)) # Runner: polls deployments 1719 + results.append(run_test("worker_pattern", test_worker_pattern)) # Worker: polls work pools 1720 + results.append(run_test("work_queue_priority", test_work_queue_priority)) 1721 + results.append(run_test("retry_failed_flows", test_retry_failed_flows)) 1722 + results.append(run_test("cancellation_flow", test_cancellation_flow)) 1723 + 1724 + # scheduler integration tests (require sleeps for background service) 1296 1725 results.append(run_test("scheduler_idempotency", test_scheduler_idempotency)) 1297 1726 results.append(run_test("parameter_merging", test_parameter_merging)) 1298 1727

+16 -1

src/api/flow_runs.zig

··· 276 276 .auto_scheduled = false, 277 277 .idempotency_key = null, 278 278 .empirical_policy = empirical_policy orelse "{}", 279 + .state_transition_id = null, 279 280 }; 280 281 281 282 const resp = writeFlowRun(alloc, run, state_id) catch { ··· 374 375 var state_id_buf: [36]u8 = undefined; 375 376 const state_id = uuid_util.generate(&state_id_buf); 376 377 378 + // parse state_details.transition_id for idempotency 379 + const proposed_transition_id: ?[]const u8 = blk: { 380 + const state_details = state.object.get("state_details") orelse break :blk null; 381 + if (state_details != .object) break :blk null; 382 + const tid = state_details.object.get("transition_id") orelse break :blk null; 383 + if (tid != .string) break :blk null; 384 + break :blk tid.string; 385 + }; 386 + 377 387 // get current run state for orchestration 378 388 const current_run = db.getFlowRun(alloc, id) catch null; 379 389 ··· 409 419 .proposed_state_timestamp = now, 410 420 // for CopyScheduledTime: pass scheduled_time from SCHEDULED state 411 421 .initial_scheduled_time = if (current_run) |run| run.next_scheduled_start_time else null, 422 + // for PreventDuplicateTransitions: pass transition_ids 423 + .initial_transition_id = if (current_run) |run| run.state_transition_id else null, 424 + .proposed_transition_id = proposed_transition_id, 412 425 .run_id = id, 413 426 .flow_id = if (current_run) |run| run.flow_id else null, 414 427 .deployment_id = if (current_run) |run| run.deployment_id else null, ··· 461 474 null, // expected_start_time 462 475 retry_scheduled, // next_scheduled_start_time 463 476 updated_policy, // empirical_policy updates 477 + proposed_transition_id, 464 478 ) catch { 465 479 json_util.sendStatus(r, "{\"detail\":\"update failed\"}", .internal_server_error); 466 480 return; ··· 538 552 rule_ctx.new_expected_start_time, 539 553 null, // next_scheduled_start_time 540 554 updated_policy, 555 + proposed_transition_id, 541 556 ) catch { 542 557 json_util.sendStatus(r, "{\"detail\":\"update failed\"}", .internal_server_error); 543 558 return; 544 559 }; 545 560 } else { 546 - db.setFlowRunState(id, state_id, state_type, state_name, now, bookkeeping_ctx.new_start_time, bookkeeping_ctx.new_end_time, bookkeeping_ctx.new_run_count, bookkeeping_ctx.new_total_run_time, rule_ctx.new_expected_start_time) catch { 561 + db.setFlowRunState(id, state_id, state_type, state_name, now, bookkeeping_ctx.new_start_time, bookkeeping_ctx.new_end_time, bookkeeping_ctx.new_run_count, bookkeeping_ctx.new_total_run_time, rule_ctx.new_expected_start_time, proposed_transition_id) catch { 547 562 json_util.sendStatus(r, "{\"detail\":\"update failed\"}", .internal_server_error); 548 563 return; 549 564 };

+25 -15

src/db/flow_runs.zig

··· 31 31 idempotency_key: ?[]const u8, 32 32 // retry policy (empirical_policy JSON) 33 33 empirical_policy: []const u8, 34 + // idempotency: transition_id from current state 35 + state_transition_id: ?[]const u8, 34 36 }; 35 37 36 38 pub const InsertParams = struct { ··· 139 141 var rows = backend.db.query( 140 142 \\SELECT id, created, updated, flow_id, name, state_type, state_name, state_timestamp, 141 143 \\ parameters, tags, run_count, expected_start_time, next_scheduled_start_time, start_time, end_time, total_run_time, 142 - \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy 144 + \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy, state_transition_id 143 145 \\FROM flow_run WHERE id = ? 144 146 , .{id}) catch return null; 145 147 defer rows.deinit(); ··· 161 163 run_count: i64, 162 164 total_run_time: f64, 163 165 expected_start_time: ?[]const u8, 166 + state_transition_id: ?[]const u8, 164 167 ) !void { 165 168 // Lock mutex only for SQLite (postgres pool handles concurrency) 166 169 if (backend.db.dialect == .sqlite) { ··· 182 185 if (expected_start_time) |est| { 183 186 txn.exec( 184 187 \\UPDATE flow_run SET state_id = ?, state_type = ?, state_name = ?, state_timestamp = ?, updated = ?, 185 - \\ start_time = ?, end_time = ?, run_count = ?, total_run_time = ?, expected_start_time = ? 188 + \\ start_time = ?, end_time = ?, run_count = ?, total_run_time = ?, expected_start_time = ?, state_transition_id = ? 186 189 \\WHERE id = ? 187 190 , .{ 188 - state_id, state_type, state_name, timestamp, timestamp, 189 - start_time, end_time, run_count, total_run_time, est, 190 - run_id, 191 + state_id, state_type, state_name, timestamp, timestamp, 192 + start_time, end_time, run_count, total_run_time, est, 193 + state_transition_id, run_id, 191 194 }) catch |err| { 192 195 log.err("database", "update flow_run error: {}", .{err}); 193 196 return err; ··· 195 198 } else { 196 199 txn.exec( 197 200 \\UPDATE flow_run SET state_id = ?, state_type = ?, state_name = ?, state_timestamp = ?, updated = ?, 198 - \\ start_time = ?, end_time = ?, run_count = ?, total_run_time = ? 201 + \\ start_time = ?, end_time = ?, run_count = ?, total_run_time = ?, state_transition_id = ? 199 202 \\WHERE id = ? 200 203 , .{ 201 204 state_id, state_type, state_name, timestamp, timestamp, 202 - start_time, end_time, run_count, total_run_time, run_id, 205 + start_time, end_time, run_count, total_run_time, state_transition_id, 206 + run_id, 203 207 }) catch |err| { 204 208 log.err("database", "update flow_run error: {}", .{err}); 205 209 return err; ··· 235 239 expected_start_time: ?[]const u8, 236 240 next_scheduled_start_time: ?[]const u8, 237 241 empirical_policy: ?[]const u8, 242 + state_transition_id: ?[]const u8, 238 243 ) !void { 239 244 if (backend.db.dialect == .sqlite) { 240 245 backend.db.mutex.lock(); ··· 255 260 \\ start_time = ?, end_time = ?, run_count = ?, total_run_time = ?, 256 261 \\ expected_start_time = COALESCE(?, expected_start_time), 257 262 \\ next_scheduled_start_time = ?, 258 - \\ empirical_policy = ? 263 + \\ empirical_policy = ?, 264 + \\ state_transition_id = ? 259 265 \\WHERE id = ? 260 266 , .{ 261 267 state_id, ··· 270 276 expected_start_time, 271 277 next_scheduled_start_time, 272 278 policy, 279 + state_transition_id, 273 280 run_id, 274 281 }) catch |err| { 275 282 log.err("database", "update flow_run error: {}", .{err}); ··· 280 287 \\UPDATE flow_run SET state_id = ?, state_type = ?, state_name = ?, state_timestamp = ?, updated = ?, 281 288 \\ start_time = ?, end_time = ?, run_count = ?, total_run_time = ?, 282 289 \\ expected_start_time = COALESCE(?, expected_start_time), 283 - \\ next_scheduled_start_time = ? 290 + \\ next_scheduled_start_time = ?, 291 + \\ state_transition_id = ? 284 292 \\WHERE id = ? 285 293 , .{ 286 294 state_id, ··· 294 302 total_run_time, 295 303 expected_start_time, 296 304 next_scheduled_start_time, 305 + state_transition_id, 297 306 run_id, 298 307 }) catch |err| { 299 308 log.err("database", "update flow_run error: {}", .{err}); ··· 322 331 var rows = backend.db.query( 323 332 \\SELECT id, created, updated, flow_id, name, state_type, state_name, state_timestamp, 324 333 \\ parameters, tags, run_count, expected_start_time, next_scheduled_start_time, start_time, end_time, total_run_time, 325 - \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy 334 + \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy, state_transition_id 326 335 \\FROM flow_run ORDER BY created DESC LIMIT ? 327 336 , .{@as(i64, @intCast(limit))}) catch |err| { 328 337 log.err("database", "list flow_runs error: {}", .{err}); ··· 345 354 var rows = backend.db.query( 346 355 \\SELECT id, created, updated, flow_id, name, state_type, state_name, state_timestamp, 347 356 \\ parameters, tags, run_count, expected_start_time, next_scheduled_start_time, start_time, end_time, total_run_time, 348 - \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy 357 + \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy, state_transition_id 349 358 \\FROM flow_run WHERE deployment_id = ? AND state_type = 'SCHEDULED' 350 359 \\ORDER BY expected_start_time ASC LIMIT ? 351 360 , .{ deployment_id, @as(i64, @intCast(limit)) }) catch |err| { ··· 417 426 var rows = backend.db.query( 418 427 \\SELECT id, created, updated, flow_id, name, state_type, state_name, state_timestamp, 419 428 \\ parameters, tags, run_count, expected_start_time, next_scheduled_start_time, start_time, end_time, total_run_time, 420 - \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy 429 + \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy, state_transition_id 421 430 \\FROM flow_run WHERE deployment_id = ? AND state_type = 'SCHEDULED' 422 431 \\ AND (next_scheduled_start_time IS NULL OR next_scheduled_start_time <= ?) 423 432 \\ORDER BY next_scheduled_start_time ASC LIMIT ? ··· 434 443 var rows = backend.db.query( 435 444 \\SELECT id, created, updated, flow_id, name, state_type, state_name, state_timestamp, 436 445 \\ parameters, tags, run_count, expected_start_time, next_scheduled_start_time, start_time, end_time, total_run_time, 437 - \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy 446 + \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy, state_transition_id 438 447 \\FROM flow_run WHERE deployment_id = ? AND state_type = 'SCHEDULED' 439 448 \\ORDER BY next_scheduled_start_time ASC LIMIT ? 440 449 , .{ deployment_id, @as(i64, @intCast(limit)) }) catch |err| { ··· 466 475 var rows = backend.db.query( 467 476 \\SELECT id, created, updated, flow_id, name, state_type, state_name, state_timestamp, 468 477 \\ parameters, tags, run_count, expected_start_time, next_scheduled_start_time, start_time, end_time, total_run_time, 469 - \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy 478 + \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy, state_transition_id 470 479 \\FROM flow_run WHERE work_queue_id = ? AND state_type = 'SCHEDULED' 471 480 \\ AND (next_scheduled_start_time IS NULL OR next_scheduled_start_time <= ?) 472 481 \\ORDER BY next_scheduled_start_time ASC LIMIT ? ··· 483 492 var rows = backend.db.query( 484 493 \\SELECT id, created, updated, flow_id, name, state_type, state_name, state_timestamp, 485 494 \\ parameters, tags, run_count, expected_start_time, next_scheduled_start_time, start_time, end_time, total_run_time, 486 - \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy 495 + \\ deployment_id, deployment_version, work_queue_name, work_queue_id, auto_scheduled, idempotency_key, empirical_policy, state_transition_id 487 496 \\FROM flow_run WHERE work_queue_id = ? AND state_type = 'SCHEDULED' 488 497 \\ORDER BY next_scheduled_start_time ASC LIMIT ? 489 498 , .{ work_queue_id, @as(i64, @intCast(limit)) }) catch |err| { ··· 566 575 .auto_scheduled = r.int(20) != 0, 567 576 .idempotency_key = if (r.text(21).len > 0) alloc.dupe(u8, r.text(21)) catch null else null, 568 577 .empirical_policy = alloc.dupe(u8, r.text(22)) catch "{}", 578 + .state_transition_id = if (r.text(23).len > 0) alloc.dupe(u8, r.text(23)) catch null else null, 569 579 }; 570 580 } 571 581

+4

src/db/migrations/002_state_transition_id/postgres.sql

··· 1 + -- 002_state_transition_id: add transition_id for idempotency 2 + -- used by PreventDuplicateTransitions rule 3 + 4 + ALTER TABLE flow_run ADD COLUMN IF NOT EXISTS state_transition_id TEXT;

+4

src/db/migrations/002_state_transition_id/sqlite.sql

··· 1 + -- 002_state_transition_id: add transition_id for idempotency 2 + -- used by PreventDuplicateTransitions rule 3 + 4 + ALTER TABLE flow_run ADD COLUMN state_transition_id TEXT;

+5 -6

src/db/migrations_data.zig

··· 17 17 .sqlite_sql = @embedFile("migrations/001_initial/sqlite.sql"), 18 18 .postgres_sql = @embedFile("migrations/001_initial/postgres.sql"), 19 19 }, 20 - // add new migrations here: 21 - // .{ 22 - // .id = "002_add_feature", 23 - // .sqlite_sql = @embedFile("migrations/002_add_feature/sqlite.sql"), 24 - // .postgres_sql = @embedFile("migrations/002_add_feature/postgres.sql"), 25 - // }, 20 + .{ 21 + .id = "002_state_transition_id", 22 + .sqlite_sql = @embedFile("migrations/002_state_transition_id/sqlite.sql"), 23 + .postgres_sql = @embedFile("migrations/002_state_transition_id/postgres.sql"), 24 + }, 26 25 };

+140 -2

src/orchestration/flow_rules.zig

··· 150 150 } 151 151 152 152 // ============================================================================ 153 + // PreventDuplicateTransitions 154 + // ============================================================================ 155 + 156 + /// PreventDuplicateTransitions: idempotency guard for state transitions 157 + /// 158 + /// this rule prevents duplicate state transitions by comparing transition_id 159 + /// from the current state against the proposed state's transition_id. 160 + /// 161 + /// if both are set and equal, the transition is rejected - this indicates 162 + /// the client is retrying a request that already succeeded. 163 + /// 164 + /// workflow: 165 + /// 1. client sends state transition with transition_id in state_details 166 + /// 2. server stores transition_id when state is committed 167 + /// 3. if client retries the same request, transition_id will match 168 + /// 4. rule rejects duplicate to return the existing state 169 + pub const PreventDuplicateTransitions = OrchestrationRule{ 170 + .name = "PreventDuplicateTransitions", 171 + .from_states = StateTypeSet.ALL.withNull(), 172 + .to_states = StateTypeSet.ALL, 173 + .before_transition = preventDuplicateTransitionsFn, 174 + }; 175 + 176 + fn preventDuplicateTransitionsFn(ctx: *RuleContext) void { 177 + const initial_tid = ctx.initial_transition_id orelse return; 178 + const proposed_tid = ctx.proposed_transition_id orelse return; 179 + 180 + if (std.mem.eql(u8, initial_tid, proposed_tid)) { 181 + ctx.reject("This run has already made this state transition."); 182 + } 183 + } 184 + 185 + // ============================================================================ 153 186 // CoreFlowPolicy 154 187 // ============================================================================ 155 188 156 189 /// CoreFlowPolicy: ordered list of rules for flow run state transitions 157 190 pub const CoreFlowPolicy = [_]OrchestrationRule{ 191 + PreventDuplicateTransitions, // idempotency check first 158 192 PreventPendingTransitions, 159 193 CopyScheduledTime, 160 194 WaitForScheduledTime, 161 195 RetryFailedFlows, 162 196 // future rules will be added here in priority order: 163 - // PreventDuplicateTransitions, 164 197 // HandleFlowTerminalStateTransitions, 165 - // etc. 198 + // HandlePausingFlows, 199 + // HandleResumingPausedFlows, 166 200 }; 167 201 168 202 // ============================================================================ ··· 601 635 try testing.expect(!ctx.set_resuming_false); 602 636 try testing.expect(!ctx.clear_pause_keys); 603 637 } 638 + 639 + // ============================================================================ 640 + // PreventDuplicateTransitions Tests 641 + // ============================================================================ 642 + 643 + test "PreventDuplicateTransitions.appliesTo" { 644 + const testing = std.testing; 645 + 646 + // applies to any state → any state 647 + try testing.expect(PreventDuplicateTransitions.appliesTo(.PENDING, .RUNNING)); 648 + try testing.expect(PreventDuplicateTransitions.appliesTo(.RUNNING, .COMPLETED)); 649 + try testing.expect(PreventDuplicateTransitions.appliesTo(.SCHEDULED, .PENDING)); 650 + try testing.expect(PreventDuplicateTransitions.appliesTo(null, .PENDING)); 651 + } 652 + 653 + test "PreventDuplicateTransitions allows when no transition_ids" { 654 + const testing = std.testing; 655 + 656 + var ctx = RuleContext{ 657 + .initial_state = .PENDING, 658 + .proposed_state = .RUNNING, 659 + .initial_state_timestamp = "2024-01-19T16:30:00Z", 660 + .proposed_state_timestamp = "2024-01-19T16:30:01Z", 661 + .run_id = "test-run-id", 662 + // no transition_ids 663 + }; 664 + 665 + rules.applyPolicy(&CoreFlowPolicy, &ctx); 666 + 667 + try testing.expectEqual(rules.ResponseStatus.ACCEPT, ctx.result.status); 668 + } 669 + 670 + test "PreventDuplicateTransitions allows when only initial_transition_id set" { 671 + const testing = std.testing; 672 + 673 + var ctx = RuleContext{ 674 + .initial_state = .PENDING, 675 + .proposed_state = .RUNNING, 676 + .initial_state_timestamp = "2024-01-19T16:30:00Z", 677 + .proposed_state_timestamp = "2024-01-19T16:30:01Z", 678 + .run_id = "test-run-id", 679 + .initial_transition_id = "abc123", 680 + .proposed_transition_id = null, 681 + }; 682 + 683 + rules.applyPolicy(&CoreFlowPolicy, &ctx); 684 + 685 + try testing.expectEqual(rules.ResponseStatus.ACCEPT, ctx.result.status); 686 + } 687 + 688 + test "PreventDuplicateTransitions allows when only proposed_transition_id set" { 689 + const testing = std.testing; 690 + 691 + var ctx = RuleContext{ 692 + .initial_state = .PENDING, 693 + .proposed_state = .RUNNING, 694 + .initial_state_timestamp = "2024-01-19T16:30:00Z", 695 + .proposed_state_timestamp = "2024-01-19T16:30:01Z", 696 + .run_id = "test-run-id", 697 + .initial_transition_id = null, 698 + .proposed_transition_id = "abc123", 699 + }; 700 + 701 + rules.applyPolicy(&CoreFlowPolicy, &ctx); 702 + 703 + try testing.expectEqual(rules.ResponseStatus.ACCEPT, ctx.result.status); 704 + } 705 + 706 + test "PreventDuplicateTransitions allows when transition_ids differ" { 707 + const testing = std.testing; 708 + 709 + var ctx = RuleContext{ 710 + .initial_state = .PENDING, 711 + .proposed_state = .RUNNING, 712 + .initial_state_timestamp = "2024-01-19T16:30:00Z", 713 + .proposed_state_timestamp = "2024-01-19T16:30:01Z", 714 + .run_id = "test-run-id", 715 + .initial_transition_id = "abc123", 716 + .proposed_transition_id = "def456", // different 717 + }; 718 + 719 + rules.applyPolicy(&CoreFlowPolicy, &ctx); 720 + 721 + try testing.expectEqual(rules.ResponseStatus.ACCEPT, ctx.result.status); 722 + } 723 + 724 + test "PreventDuplicateTransitions rejects when transition_ids match" { 725 + const testing = std.testing; 726 + 727 + var ctx = RuleContext{ 728 + .initial_state = .RUNNING, 729 + .proposed_state = .COMPLETED, 730 + .initial_state_timestamp = "2024-01-19T16:30:00Z", 731 + .proposed_state_timestamp = "2024-01-19T16:30:01Z", 732 + .run_id = "test-run-id", 733 + .initial_transition_id = "abc123", 734 + .proposed_transition_id = "abc123", // same! 735 + }; 736 + 737 + rules.applyPolicy(&CoreFlowPolicy, &ctx); 738 + 739 + try testing.expectEqual(rules.ResponseStatus.REJECT, ctx.result.status); 740 + try testing.expect(ctx.result.details.reason != null); 741 + }

+4

src/orchestration/rules.zig

··· 24 24 initial_scheduled_time: ?[]const u8 = null, // next_scheduled_start_time from SCHEDULED state 25 25 proposed_scheduled_time: ?[]const u8 = null, // scheduled_time from proposed state (if any) 26 26 27 + // idempotency (for PreventDuplicateTransitions) 28 + initial_transition_id: ?[]const u8 = null, // transition_id from current state 29 + proposed_transition_id: ?[]const u8 = null, // transition_id from proposed state 30 + 27 31 // run metadata (for rules that need it) 28 32 run_id: []const u8, 29 33 flow_id: ?[]const u8 = null,