···
 
 ## Project Structure
 
-- `cm_inst.py` — Instruction set definitions (ALUOp hierarchy, ALUInst, SMInst, Addr)
-- `tokens.py` — Token type hierarchy (Token -> CMToken -> DyadToken/MonadToken; SMToken, CfgToken -> LoadInstToken/RouteSetToken, IOToken)
+- `cm_inst.py` — Instruction set definitions (Port, MemOp, CfgOp, ALUOp hierarchy, ALUInst, SMInst, Addr)
+- `tokens.py` — Token type hierarchy (Token -> CMToken -> DyadToken/MonadToken; SMToken, CfgToken -> LoadInstToken/RouteSetToken, IOToken). Imports ISA enums from cm_inst.
 - `sm_mod.py` — Structure Memory cell model (Presence enum, SMCell dataclass, StructureMem resource)
 - `dfasm.lark` — Lark grammar for dfasm graph assembly language
 - `emu/` — Behavioural emulator package (SimPy-based discrete event simulation)
···
 - `SMToken(Token)` -- `addr: int`, `op: MemOp`, `flags`, `data`, `ret: Optional[CMToken]`
 - `SysToken(Token)` -- `addr: Optional[int]`
   - `CfgToken(SysToken)` -- `op: CfgOp` (base class, no payload)
-    - `LoadInstToken(CfgToken)` -- `instructions: tuple[ALUInst | SMInst, ...]`
-    - `RouteSetToken(CfgToken)` -- `pe_routes: tuple[int, ...]`, `sm_routes: tuple[int, ...]`
+    - `LoadInstToken(CfgToken)` -- `instructions: tuple[ALUInst | SMInst, ...]` (contiguous block from `addr`)
+    - `RouteSetToken(CfgToken)` -- `pe_routes: frozenset[int]`, `sm_routes: frozenset[int]`
   - `IOToken(SysToken)` -- `data: Optional[List[int]]`
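As a reading aid, the `SysToken` branch of this hunk can be sketched with dataclasses. A minimal sketch only: the enum members and default values here are placeholders, not the real `cm_inst` definitions, and `CMToken` is stubbed out.

```python
from dataclasses import dataclass
from enum import Enum, auto
from typing import List, Optional


class MemOp(Enum):          # placeholder members; the real enum lives in cm_inst
    READ = auto()
    WRITE = auto()
    FREE = auto()


class CfgOp(Enum):          # placeholder members
    LOAD_INST = auto()
    ROUTE_SET = auto()


@dataclass(frozen=True)
class Token:
    pass


@dataclass(frozen=True)
class SMToken(Token):
    addr: int
    op: MemOp
    flags: int = 0
    data: Optional[int] = None
    ret: Optional[Token] = None     # CMToken in the real hierarchy


@dataclass(frozen=True)
class SysToken(Token):
    addr: Optional[int] = None


@dataclass(frozen=True)
class CfgToken(SysToken):
    op: CfgOp = CfgOp.LOAD_INST     # base class, no payload


@dataclass(frozen=True)
class IOToken(SysToken):
    data: Optional[List[int]] = None


assert isinstance(SMToken(addr=3, op=MemOp.READ), Token)
assert IOToken().data is None
```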
 
 ### Instruction Set (cm_inst.py)
···
 - Generation counter (`gen_counters[ctx]`): stale tokens (gen mismatch) are discarded
 
 **Output routing modes** (determined by `_output_mode()`):
-- `SUPPRESS` -- FREE op, or GATE with `bool_out=False`, or no dest_l
+- `SUPPRESS` -- FREE_CTX op, or GATE with `bool_out=False`, or no dest_l
 - `SINGLE` -- dest_l only (no dest_r)
 - `DUAL` -- both dest_l and dest_r (non-switch)
 - `SWITCH` -- SW* routing ops: `bool_out=True` sends data to dest_l, trigger to dest_r; vice versa
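The four modes can be mirrored in a small decision function. This is hypothetical: the real `_output_mode()` inspects the decoded instruction on the PE model, and the string op names here exist purely for illustration.

```python
from enum import Enum, auto
from typing import Optional


class OutputMode(Enum):
    SUPPRESS = auto()
    SINGLE = auto()
    DUAL = auto()
    SWITCH = auto()


# Hypothetical decision logic mirroring the four bullets above.
def output_mode(op_name: str, bool_out: bool,
                dest_l: Optional[int], dest_r: Optional[int]) -> OutputMode:
    if op_name.startswith("SW"):                 # SW* routing ops
        return OutputMode.SWITCH
    if op_name == "FREE_CTX" or dest_l is None:  # nothing to emit
        return OutputMode.SUPPRESS
    if op_name == "GATE" and not bool_out:       # gate closed
        return OutputMode.SUPPRESS
    if dest_r is None:
        return OutputMode.SINGLE
    return OutputMode.DUAL


assert output_mode("ADD", False, 4, None) is OutputMode.SINGLE
assert output_mode("FREE_CTX", False, 4, 5) is OutputMode.SUPPRESS
assert output_mode("SWL", True, 4, 5) is OutputMode.SWITCH
```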
···
 
 ### Module Dependency Graph
 
-Root-level modules (`cm_inst.py`, `tokens.py`, `sm_mod.py`) define the ISA and token types. The `emu/` package imports from root-level modules but root-level modules never import from `emu/`. The `asm/` package imports from both root-level modules and `emu/types.py` (for PEConfig/SMConfig), but neither root-level modules nor `emu/` import from `asm/`.
+`cm_inst.py` defines ISA enums and instruction types (no dependencies). `tokens.py` imports from `cm_inst.py` and defines the token hierarchy. `sm_mod.py` is independent. The `emu/` package imports from root-level modules but root-level modules never import from `emu/`. The `asm/` package imports from both root-level modules and `emu/types.py` (for PEConfig/SMConfig), but neither root-level modules nor `emu/` import from `asm/`.
 
 ```
-tokens.py <-- cm_inst.py <-- emu/types.py
-    ^              |              |
-    |              v              v
-sm_mod.py     emu/alu.py     emu/pe.py <--> emu/sm.py
-    |                 \          /
-    |                emu/network.py
-    |                      ^
-    v                      |
+cm_inst.py <-- tokens.py <-- emu/types.py
+     |              |              |
+     v              v              v
+ emu/alu.py    sm_mod.py     emu/pe.py <--> emu/sm.py
+                      \          /
+                     emu/network.py
+                           ^
+                           |
 asm/ir.py <-- asm/opcodes.py    asm/codegen.py
     |               |                 |
     v               v                 v
asm/CLAUDE.md (+2 -2)

···
 
 ## Dependencies
 
-- **Uses**: `cm_inst` (ALUOp, ALUInst, SMInst, Addr, MemOp), `tokens` (Port, MonadToken, SMToken, CfgToken, CfgOp, MemOp), `sm_mod` (Presence), `emu/types` (PEConfig, SMConfig), `lark` (parser)
+- **Uses**: `cm_inst` (Port, MemOp, CfgOp, ALUOp, ALUInst, SMInst, Addr), `tokens` (MonadToken, SMToken, CfgToken, LoadInstToken, RouteSetToken), `sm_mod` (Presence), `emu/types` (PEConfig, SMConfig), `lark` (parser)
 - **Used by**: Test suite, user programs
 - **Boundary**: `emu/` and root-level modules must NEVER import from `asm/`
 
···
 ## Gotchas
 
 - `MemOp.WRITE` arity depends on const: monadic when const is set (cell_addr from const), dyadic when const is None (cell_addr from left operand)
-- `RoutingOp.FREE` (ALU free) and `MemOp.FREE` (SM free) share the name "free" -- assembler uses `free` for ALU and `free_sm` for SM to disambiguate
+- `RoutingOp.FREE_CTX` (ALU context deallocation) and `MemOp.FREE` (SM free) are disambiguated by mnemonic: assembler uses `free_ctx` for ALU and `free` for SM
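The `MemOp.WRITE` arity gotcha can be restated as a tiny sketch. The helper `write_operands` is hypothetical and does not exist in the codebase; it only encodes the rule from the bullet above.

```python
from typing import Optional, Tuple


# Hypothetical helper restating the gotcha: the same WRITE opcode is
# monadic or dyadic depending on whether its const field is populated.
def write_operands(const: Optional[int], left: int,
                   data: int) -> Tuple[int, int, int]:
    """Return (arity, cell_addr, value) for a MemOp.WRITE."""
    if const is not None:
        return (1, const, data)   # monadic: cell_addr comes from const
    return (2, left, data)        # dyadic: cell_addr from left operand


assert write_operands(const=7, left=99, data=42) == (1, 7, 42)
assert write_operands(const=None, left=99, data=42) == (2, 99, 42)
```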
 
 <!-- freshness: 2026-02-23 -->
···
-# Dynamic Dataflow CPU — Architectural Positioning & Research Notes
-
-Working notes capturing design philosophy decisions, research insights,
-and prior art observations. Not a design spec — a reference for "why we
-chose this direction" and "what we learned from the literature."
-
-## Companion Documents
-
-- `architecture-overview.md` — master architecture reference
-- `pe-design.md` — PE pipeline, matching store, context slots
-- `design-alternatives.md` — rejected/deferred approaches
-- `Prior_Art_Reference_Guide_for_a_Discrete-Logic_Dynamic_Dataflow_CPU.md`
-  — comprehensive bibliography
-
----
-
-## 1. Core Architectural Commitment: Pure Dataflow, Not Hybrid
-
-This project is a **dynamic dataflow machine**, not a multithreaded RISC
-core with dataflow-style synchronisation primitives.
-
-The MIT lineage went: Manchester → TTDA → Monsoon → *T → Sparcle, with
-each step making the PE more like a conventional CPU and reducing the
-dataflow aspects to synchronisation mechanisms (presence bits on memory,
-fast context switching, message passing). By *T (1992), the "dataflow"
-part is essentially hardware semaphores bolted onto a modified SPARC.
-
-**We are not going down that road.** The point of this project is the
-different execution model — where synchronisation is implicit in the
-data flow, not explicit in the program. A PE that needs a program counter,
-register file, bypass network, and branch prediction is solving a
-different problem than we're solving.
-
-Specific non-goals:
-- Sequential instruction streams within a PE (*T, Sparcle, EARTH)
-- Register files as primary operand storage
-- Program counter / sequential fetch logic
-- Branch prediction hardware
-- Hardware semaphores / presence-bit memory traps (*T model)
-- "Make each PE a full CPU" — this blows the transistor budget from
-  4 PEs down to 1-2, losing the parallelism that's the whole point
-
-### Where We Sit on the Spectrum
-
-```
-Pure dataflow ←————————————————————————→ Pure von Neumann
-Manchester       Monsoon        *T/Sparcle     OoO superscalar
-    |               |                |                |
-    |         [this project]        |                |
-    |               |                |                |
-hash matching   ETS/direct       RISC+sync     register renaming
-no PC           no PC            has PC        has PC
-no registers    no regs          register file register file
-```
-
-We're roughly at the Monsoon point on this spectrum: direct-indexed
-matching with presence bits (independently derived, see §2), token-driven
-execution, no program counter. But with a smaller/simpler PE than Monsoon
-(fewer pipeline stages, smaller frames, generation counters for ABA
-protection instead of Monsoon's tighter deallocation control).
-
-### What IS Worth Mining from the Hybrid Work
-
-The Papadopoulos & Traub 1991 paper ("Multithreading: A Revisionist View
-of Dataflow Architectures") contains one microarchitectural optimisation
-that's relevant without changing the architecture:
-
-**Sequential scheduling of monadic chains.** If A feeds B feeds C and all
-are monadic (single-input), the tokens cycle through the full pipeline
-for each hop: token-in → match-bypass → fetch → execute → token-out →
-back to token-in. If the PE could recognise this pattern and keep the
-result in-pipeline for the next instruction (skipping token formatting
-and input FIFO), that's a significant latency win on sequential chains.
-
-This is a **microarchitectural shortcut**, not an architectural change.
-The token semantics don't change. The compiler doesn't need to know. It's
-just an optimisation where the PE notices "output goes to me, monadic,
-next instruction" and short-circuits the pipeline. Worth considering
-post-v0 if sequential throughput is a problem.
-
----
-
-## 2. Convergence with Monsoon's Explicit Token Store
-
-The matching store design in `pe-design.md` — direct-indexed context slots
-with occupied bits, compiler-assigned slot IDs, bump allocator — was
-derived independently from first principles:
-
-1. Manchester's hash table has terrible utilisation (<20%) and enormous
-   hardware cost (16 SRAM banks + comparators per PE)
-2. Amamiya's semi-CAM is better but CAM chips are tiny at discrete scale
-3. If the compiler assigns context IDs statically, you can use them as
-   direct SRAM addresses → no associative lookup at all
-4. Occupied bit = 1-bit presence flag per matching entry
-5. Generation counter handles ABA on slot reuse
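Steps 3-5 can be sketched in a few lines of Python. Illustrative only: the real matching store also carries the bump allocator, throttle, and overflow path, and the slot/gen widths here are the documented 16-slot / 2-bit values, not a hardware model.

```python
from dataclasses import dataclass
from typing import List, Optional, Tuple


@dataclass
class Slot:
    occupied: bool = False       # 1-bit presence flag per matching entry
    gen: int = 0                 # 2-bit generation counter (ABA protection)
    operand: Optional[int] = None


class MatchStore:
    """Direct-indexed matching: compiler-assigned slot IDs are SRAM addresses."""

    def __init__(self, n_slots: int = 16):
        self.slots: List[Slot] = [Slot() for _ in range(n_slots)]

    def arrive(self, slot_id: int, gen: int, value: int) -> Optional[Tuple[int, int]]:
        """First operand parks; second operand fires. Stale gens are dropped."""
        s = self.slots[slot_id]             # direct index, no associative lookup
        if gen != s.gen:
            return None                     # stale token from a reused slot
        if not s.occupied:
            s.occupied, s.operand = True, value
            return None
        pair = (s.operand, value)
        s.occupied, s.operand = False, None
        return pair

    def free(self, slot_id: int) -> None:
        s = self.slots[slot_id]
        s.occupied = False
        s.gen = (s.gen + 1) % 4             # bump the 2-bit counter on reuse


ms = MatchStore()
assert ms.arrive(3, 0, 10) is None          # first operand waits
assert ms.arrive(3, 0, 20) == (10, 20)      # match fires
ms.free(3)
assert ms.arrive(3, 0, 5) is None           # gen is now 1: old-gen token dropped
```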
-
-This turns out to be essentially the same thing Papadopoulos and Culler
-arrived at with the Explicit Token Store (ETS) for Monsoon (1990), via
-a similar line of reasoning from the TTDA's frame-based matching (Arvind
-& Nikhil 1987/1990).
-
-**Key differences from Monsoon ETS:**
-
-| Aspect | Monsoon ETS | This design |
-|--------|-------------|-------------|
-| Frame size | 128 words (fixed) | 16-32 entries (configurable) |
-| Allocation | Shared free-list of frame pointers | Bump allocator + bitmap/FIFO |
-| ABA protection | Tight dealloc control, no reuse until drained | 2-bit generation counter per slot |
-| Pipeline depth | 8 stages | 5 stages (target) |
-| Matching entry ID | Compiler-assigned slot offset in frame | Compiler-assigned match_entry in context slot |
-| Overflow | Not handled (compiler must fit) | Stall + optional future CAM buffer |
-
-The generation counter is more defensive than Monsoon's approach, which
-is appropriate for a first build where catching bugs matters more than
-saving 2 bits per slot. Monsoon's free-list is cleaner in theory but the
-bump allocator is simpler hardware (counter vs. FIFO management).
-
-**Actionable insight from this convergence:** the ETS papers (especially
-Papadopoulos's 1988 PhD thesis) contain detailed pipeline timing, hazard
-analysis, and state-bit logic that's directly applicable to our matching
-store, even though the designs were arrived at independently. Worth
-reading for implementation details, not just architecture.
-
----
-
-## 3. Clock Efficiency as Primary PE Constraint
-
-In discrete logic, clock speed is the hard constraint. Individual gate
-delays are ~10-30ns per stage depending on technology, and pipeline depth
-directly multiplies total latency. At realistic clock speeds (5-20 MHz
-for well-designed discrete logic), every wasted cycle is expensive.
-
-This means:
-- **Single-cycle matching is non-negotiable** (achieved via direct indexing)
-- **Pipeline depth should be minimised** — each stage adds a clock period
-  of latency. Monsoon's 8 stages would give 400-1600ns per token at
-  discrete-logic speeds. Our target of 5 stages is aggressive but
-  achievable.
-- **Monadic bypass matters** — monadic tokens skipping the matching stage
-  saves a full cycle per monadic operation. At 50% monadic ops (typical
-  for many programs), this is significant.
-- **Network latency is the enemy** — every hop between PEs adds pipeline
-  latency. Compiler-assigned locality (keeping communicating nodes on
-  the same PE) is critical. This is why we care about static PE
-  assignment even though matching is dynamic.
-- **The sequential scheduling shortcut (§1) becomes more valuable** the
-  slower the clock is — if a monadic chain of 5 ops takes 25 cycles
-  through the full pipeline but could take 5 cycles with short-circuit
-  execution, that's 100-500ns saved at discrete speeds.
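The chain arithmetic in the last bullet can be made concrete with a toy model. A sketch under the stated assumptions (each hop pays the full pipeline depth; short-circuit execution pays one cycle per op), not a timing simulation.

```python
# Toy model: a k-op monadic chain costs roughly k * depth cycles through
# the full pipeline, versus ~k cycles if the PE short-circuits results
# in-pipeline (assumed model from the notes above).
def chain_cycles(ops: int, depth: int, short_circuit: bool) -> int:
    return ops if short_circuit else ops * depth


assert chain_cycles(5, 5, short_circuit=False) == 25
assert chain_cycles(5, 5, short_circuit=True) == 5
```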
-
-### Implications for PE Design
-
-Every pipeline stage must justify its existence in terms of critical path.
-If a stage can be merged with an adjacent stage without extending the
-critical path beyond the target clock period, merge it.
-
-Stages that are "free" (can overlap with SRAM access time): address
-generation, mux selection, comparator setup.
-
-Stages that set the clock period: SRAM read (15-25ns for fast async SRAM),
-ALU operation (depends on width — 8-bit add ~15ns in 74HC, 16-bit ~25ns
-with carry lookahead).
-
----
-
-## 4. Technology Notes
-
-### Not Strictly TTL
-
-The project is described as "74-series TTL + SRAM" but the actual target
-technology is more nuanced:
-
-- **74HC / 74HCT CMOS** is the likely primary logic family, not original
-  74-series TTL. HC/HCT gives lower power, better noise margins, and
-  similar or better speed at the gate level. HCT is input-compatible
-  with TTL levels.
-- **74AC / 74ACT** (Advanced CMOS) for critical-path stages where the
-  extra speed matters. ~5ns propagation vs ~10ns for HC.
-- **74F** (FAST TTL) is an option for specific high-speed paths but draws
-  more power and is less available.
-- **Async SRAM** (IS61C256, AS6C4008, etc.) for all bulk storage:
-  instruction memory, matching store, token FIFOs, structure memory.
-  15-25ns access times are the pipeline clock floor.
-- **EEPROMs** (AT28C256 or similar) for instruction memory where runtime
-  reprogramming via type-11 is acceptable with higher write latency.
-  Or SRAM with battery backup / external loading.
-
-The key constraint is **no large-scale integration beyond commodity SRAM
-and EEPROM**. No FPGAs in the final build (though FPGA prototyping is
-encouraged). No custom ASICs. No microcontrollers in the datapath (though
-a microcontroller as external test fixture / bootstrap host is fine for
-development).
-
-The "period-plausible" framing refers to the transistor budget being
-comparable to processors from the late 1970s / early 1980s (68000-class),
-not to the specific technology used. Modern CMOS 74-series parts are
-faster and lower power than original TTL but the logic complexity and
-design methodology are the same.
-
----
-
-## 5. Priority Reading List (from Prior Art Survey)
-
-Based on the reference guide and current design state, prioritised for
-maximum impact on near-term design decisions:
-
-### Must-Read (directly affects current design choices)
-
-1. **Papadopoulos & Culler, "Monsoon: An Explicit Token-Store
-   Architecture" (ISCA 1990)** — ETS mechanics, 8-stage pipeline,
-   frame memory organisation. Closest prior art to our matching store.
-
-2. **Culler & Papadopoulos, "The Explicit Token Store" (JPDC 1990)** —
-   Extended journal version with more detail on state-bit mechanism
-   and pipeline stages.
-
-3. **Papadopoulos PhD thesis (MIT, 1988)** — THE most detailed Monsoon
-   hardware source. Board-level design, chip selection, pipeline timing.
-   Hard to find but worth the effort.
-
-4. **Sakai et al., "An Architecture of a Dataflow Single Chip Processor"
-   (ISCA 1989)** — EM-4 core paper. 50K-gate PE, circular pipeline,
-   direct matching, strongly connected arc model. Most sophisticated
-   pipeline design in the literature.
-
-5. **da Silva & Watson, "A Pseudo-Associative Matching Store with Hardware
-   Hashing" (IEE 1983)** — Even though we're not using hash matching,
-   understanding WHY Manchester went this way and what the tradeoffs
-   were informs our design.
-
-6. **Culler, "Resource Management for the Tagged Token Dataflow
-   Architecture" (MIT TR-332, 1985)** — Token store overflow, deadlock,
-   frame-space management. Essential for understanding throttling and
-   resource constraints.
-
-### Should-Read (informs broader design context)
-
-7. **Dennis, "Building Blocks for Data Flow Prototypes" (ISCA 1980)** —
-   Modular hardware building blocks for discrete-logic dataflow. May
-   directly influence our board-level module decomposition.
-
-8. **Sakai et al., EM-4 network paper (Parallel Computing 1993)** —
-   Circular omega network design and deadlock prevention. Relevant
-   when we scale past shared bus.
-
-9. **Arvind & Nikhil, "Executing a Program on the MIT TTDA" (IEEE TC
-   1990)** — TTDA PE organisation, tag format, I-structure memory.
-   Foundational context for understanding Monsoon.
-
-10. **Papadopoulos & Traub, "Multithreading: A Revisionist View" (ISCA
-    1991)** — Sequential scheduling optimisation. Not for the
-    architecture, but for the microarchitectural shortcut idea.
-
-### Background (fills in the picture)
-
-11. **Lee & Hurson, "Dataflow Architectures and Multithreading" (IEEE
-    Computer 1994)** — Survey bridging pure dataflow to multithreading era.
-
-12. **Arvind & Culler, "Dataflow Architectures" (Annual Review 1986)** —
-    MIT perspective, good overview of the design space.
-
-13. **Grafe et al., "The Epsilon Dataflow Processor" (ISCA 1989)** —
-    Hybrid approach (Sandia), interesting as a contrast to show where
-    we DON'T want to go.
-
-14. **Watson & Gurd, "A Practical Data Flow Computer" (IEEE Computer
-    1982)** — Board-level Manchester hardware aimed at hardware engineers.
-
----
-
-## 6. Key Open Questions Informed by Research
-
-Things the prior art survey surfaced that we should think about:
-
-1. **Monsoon's 128-word frames vs our 16-32 entry slots**: are we too
-   small? Monsoon's larger frames reduce allocation frequency but waste
-   space on small activations. Our smaller slots are more efficient but
-   may cause more allocation churn. Need to compile some test programs
-   and measure.
-
-2. **EM-4's circular pipeline**: their PE reuses pipeline stages for
-   different phases of token processing, reducing total hardware per PE.
-   Worth investigating whether our 5-stage pipeline could benefit from
-   a similar trick.
-
-3. **EM-4's strongly connected arc model**: a different take on monadic
-   chains where consecutive operations within a thread execute without
-   re-entering the network. Related to the sequential scheduling idea
-   but architecturally distinct. Need to read the papers to understand
-   the hardware implications.
-
-4. **I-structure memory (Arvind)**: presence bits on structure memory
-   words for synchronisation. Our SM doesn't currently have this — SM
-   operations are simple read/write/RMW. I-structures enable deferred
-   reads (read of empty word blocks until write arrives). This is a
-   significant capability for certain parallel patterns. Worth
-   evaluating whether SM should support it.
-
-5. **Dennis's "Building Blocks" approach**: modular, composable hardware
-   units for dataflow. May suggest a different physical decomposition
-   than our current CM/SM/IO split. Need to read the paper.
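The deferred-read behaviour in point 4 can be modelled in a few lines. A toy sketch only: the current SM has no presence bits, and the callback-style `resume` interface is an assumption for illustration.

```python
from collections import defaultdict, deque

# Toy I-structure: a read of an empty word is deferred and satisfied
# when the matching write arrives (presence-bit semantics).
class IStructure:
    def __init__(self):
        self.words = {}                         # addr -> value (present)
        self.deferred = defaultdict(deque)      # addr -> waiting readers

    def read(self, addr, resume):
        if addr in self.words:
            resume(self.words[addr])            # presence bit set: answer now
        else:
            self.deferred[addr].append(resume)  # block until written

    def write(self, addr, value):
        self.words[addr] = value
        while self.deferred[addr]:
            self.deferred[addr].popleft()(value)  # wake deferred readers


out = []
ist = IStructure()
ist.read(0, out.append)      # read-before-write: deferred
ist.write(0, 7)              # write wakes the reader
ist.read(0, out.append)      # subsequent read answers immediately
assert out == [7, 7]
```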
design-notes/versions/architecture-overview.md (-290)
···
-# Dynamic Dataflow CPU — Architecture Overview
-
-Master reference document. For detailed design of individual subsystems, see
-companion documents. For rejected/deferred approaches and decision rationale,
-see `design-alternatives.md`.
-
-## Companion Documents
-
-- `pe-design.md` — PE pipeline, matching store, instruction memory, context slots
-- `sm-design.md` — structure memory interface, operations, banking, address space
-- `network-and-communication.md` — interconnect, routing, clocking, handshaking
-- `io-and-bootstrap.md` — I/O subsystem, bootstrap sequence, type-11 protocol
-- `design-alternatives.md` — rejected/deferred approaches with rationale
-
-## Project Goals
-
-- Dynamic dataflow CPU achievable with discrete logic (74-series TTL + SRAM)
-- Multi-PE design targeting superscalar-equivalent IPC
-- "Period-plausible" transistor budget: ~25-35K logic transistors + SRAM chips
-  - Comparable to a 68000 or a couple of Z80s in logic complexity
-  - Reference builds for physical scale: Fabian Schuiki's superscalar CPU,
-    James Sharman's pipelined CPU
-- Must be able to load and execute a binary over serial without a substantial
-  conventional control core
-- Incremental build plan: single PE first, expand to multi-PE
-- Architecture must not rule out future evolution: specifically, must preserve
-  design space for asynchronous operation, network topology changes, and
-  runtime reprogramming
-
-## Key Architectural Decisions
-
-### Execution Model
-- **Dynamic dataflow** (tagged-token), not static like the Electron E1
-- Compiler performs static PE assignment and routing configuration (E1-like)
-- Matching store operates dynamically within each PE for concurrent activations
-- This is a hybrid: static routing topology, dynamic operand matching
-
-### Influences / Reference Architectures
-- **Manchester Dataflow Machine** (Gurd 1985): pipeline structure, matching
-  unit design, overflow handling
-- **DFM / Amamiya 1982**: semi-CAM concept, computational locality,
-  function-instance-based addressing, CM/SM split, TTL prototype
-- **Pao et al. (IP lookup)**: subtree bit-vector parallel search via bitwise
-  AND — useful for collision resolution or routing
-- **Electron E1**: compile-time spatial mapping, tile-based PEs, control core
-  for bootstrap
-- **Yang et al. (DDR SDRAM IP lookup)**: hash + small CAM for collision overflow
-
-### Data Width
-- 8 or 16-bit data words within PEs (TBD, likely 16-bit for practicality)
-- Internal token packets are wider (~24-32 bits for local, multi-flit for remote)
-- Instruction words will be "chunkier" due to tags/destinations
-
-## Token Packet Format (type-tagged, 32-bit)
-
-The 2-bit type field is the primary routing discriminator. It determines both
-the physical destination (which class of module receives the packet) and the
-interpretation of the remaining 30 bits.
-
-### Type Field Semantics
-
-```
-Type 00 — DYADIC: destination is a CM. token carries operand for a dyadic
-          (two-input) instruction. requires matching store lookup.
-Type 01 — MONADIC: destination is a CM. token carries operand for a monadic
-          (single-input) instruction. bypasses matching store.
-Type 10 — STRUCTURE: destination is an SM bank. carries a memory operation
-          request (read, write, atomic RMW, etc.).
-Type 11 — SYSTEM: destination is the I/O subsystem, OR carries an extended/
-          config operation. subtype field discriminates:
-          11 + 00: I/O operation (routed to I/O controller)
-          11 + 01: extended address / config write (e.g., remote instruction
-                   memory write, routing table config)
-          11 + 10: reserved (future: debug/trace, DMA)
-          11 + 11: reserved
-```
-
-Types 00/01 hit CMs only. Type 10 hits SM banks only. Type 11 can hit the
-I/O controller, target PEs (for config writes), or future system infrastructure
-depending on subtype.
-
-### Packet Formats
-
-```
-Type 00 — DYADIC (needs matching, carries generation counter):
-[type:2][PE_id:2][ctx_slot:4][gen:2][offset:7][port:1][data:14]
-
-Type 01 — MONADIC (bypass matching, no gen needed):
-[type:2][PE_id:2][offset:8][data:20]
-
-Type 10 — STRUCTURE (memory access to SM):
-[type:2][SM_id:2][operation:3][address:9][data:16]
-
-Type 11 — SYSTEM (I/O, extended addressing, config):
-[type:2][subtype:2][...28 bits interpreted per subtype...]
-  Subtype 00 (I/O): [device:N][register:N][R/W:1][data:...]
-  Subtype 01 (config): [target_PE:2][target_addr:...][data:...]
-  (exact bit allocation TBD per subtype — 28 bits of payload is generous)
-  Multi-flit when 28 bits isn't enough (config writes carrying full
-  instruction words).
-```
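The 32-bit DYADIC layout (2+2+4+2+7+1+14 bits) can be exercised with a small packing sketch. MSB-first field order is an assumption here; the documents leave bit ordering open.

```python
# Sketch of packing/unpacking the DYADIC format, MSB-first (assumed order).
def pack_dyadic(pe_id: int, ctx_slot: int, gen: int,
                offset: int, port: int, data: int) -> int:
    word = 0b00                      # type 00 = DYADIC in the top 2 bits
    for value, width in ((pe_id, 2), (ctx_slot, 4), (gen, 2),
                         (offset, 7), (port, 1), (data, 14)):
        assert 0 <= value < (1 << width), "field out of range"
        word = (word << width) | value
    return word


def unpack_data(word: int) -> int:
    return word & 0x3FFF             # low 14 bits carry the operand


w = pack_dyadic(pe_id=1, ctx_slot=5, gen=2, offset=33, port=0, data=1000)
assert w < (1 << 32)                 # fits the 32-bit bus
assert unpack_data(w) == 1000
assert (w >> 30) == 0b00             # type field
```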
-
-### Key Design Rationale
-
-- Different token types have different overhead requirements — no point paying
-  generation counter + context slot tax on monadic ops or memory accesses
-- Dyadic tokens carry 14-bit data (sufficient for most intermediates; full
-  16-bit literals can be loaded via monadic "load immediate" feeding into
-  dyadic node)
-- Monadic tokens get full 20-bit data payload on same 32-bit bus
-- Structure tokens carry full 16-bit data + 9-bit address for SM operations
-- Type 11 is the system management channel: I/O, config, and future
-  debug/trace infrastructure all live here with subtype discrimination
-- Generation counter (2-bit) ONLY on dyadic tokens — prevents ABA problem
-  when context slots are reused after deallocation
-- 32-bit bus width works with 8-bit-wide SRAM (4 bytes per token)
-- If 14-bit dyadic data is too tight, bump to 36-bit bus (9 nibbles, works
-  with 4-bit-wide SRAM). Decision deferred.
-
-## Module Taxonomy
-
-### CM (Control Module) — execution and matching
-- Instruction memory (IM): stores dataflow program (function bodies)
-  - **Runtime-writable** via type-11 config packets from the network
-  - Write from network stalls the pipeline (acceptable for config operations)
-  - Enables runtime reprogramming and eliminates need for separate config bus
-- Operand memory (OM) / matching store: buffers arriving operands, performs
-  matching
-- Receives tokens from CN (types 00/01) and DN (SM results repackaged as
-  type 00/01), produces tokens to CN and AN
-- Contains the bump allocator, throttle, and generation counter logic
-- Each PE has a unique ID, set via EEPROM (instruction decoder doubles as
-  ID store) or DIP switches during prototyping
-- See `pe-design.md` for pipeline details
-
-### SM (Structure Memory) — data storage and structure operations
-- Banked data memory (cells) for arrays, lists, heap data
-- Embedded functional units for structure operations (read, write, atomic
-  RMW, etc.)
-- Receives operation requests via AN (type 10), returns results via DN
-  (repackaged as type 00/01 tokens)
-- Operates asynchronously from CMs — split-phase memory access
-- Pure data storage — no I/O mapping (I/O lives in the type-11 subsystem)
-- See `sm-design.md` for interface and banking details
-
-### I/O Controller — peripheral interface
-- Fixed-function device on the network, NOT a full PE
-- Receives type-11 subtype-00 packets, interprets as I/O commands
-- Returns results as type 00/01 tokens to the requesting CM
-- Can spontaneously emit tokens (unsolicited I/O: UART RX, interrupt
-  equivalent) — the only network participant that generates tokens
-  without receiving one first
-- Also handles type-11 subtype-01 during bootstrap (reading from UART/flash,
-  formatting config writes to load programs into PEs)
-- See `io-and-bootstrap.md` for design details
-
-### Three Logical Interconnects (shared physical bus for v0)
-
-```
-CN (Communication Network): CM <-> CM, types 00/01
-AN (Arbitration Network):   CM -> SM, type 10
-DN (Distribution Network):  SM -> CM, type 10 results repackaged as 00/01
-System channel:             any <-> I/O controller, type 11
-```
-
-For v0 (4 PEs + 1-2 SMs + I/O controller), all traffic shares a single
-physical bus with type-based routing. Routing nodes inspect the type field
-and forward to the appropriate destination. Multiple packets can be in
-flight simultaneously if the bus is pipelined with latches at each stage.
-
-The AN/DN can be split onto separate physical paths later if SM access
-contention becomes a bottleneck. The type-field-based routing means this
-is a topology change, not a protocol change — no module interfaces need
-to change.
-
-See `network-and-communication.md` for routing, clocking, and scaling details.
-
-## Transistor Budget Estimate (4-PE system)
-
-| Component | Transistors |
-|-----------|------------|
-| 4x PE logic | 20-32K |
-| Routing network (4 PEs) | 2-3K |
-| I/O controller | ~1-2K |
-| **Total logic** | **~25-35K** |
-| SRAM chips (instruction mem, matching stores, token queues) | 8-16 chips |
-
-Note: bootstrap microsequencer removed from budget — bootstrap is handled
-by the I/O controller + type-11 config writes, or by an external
-microcontroller during early prototyping. No dedicated bootstrap hardware
-in the final architecture.
-
-## IPC / Performance Expectations
-
-- "Superscalar" is the wrong term for dataflow — there's no single
-  instruction stream
-- With 4 PEs and single-cycle matching (common case), peak is 4 ops/clock
-- Realistic sustained throughput depends on:
-  - Network crossing frequency (adds routing latency)
-  - Hash path hits vs direct index (matching latency)
-  - Available parallelism in the program
-  - Network contention (shared bus at v0 scale)
-- Parallel workloads (matrix multiply, FFT): near peak
-- Sequential/pointer-chasing code: ~0.5-1 ops/clock (still competitive
-  with 6502)
-- Key insight: matching store performance is the primary bottleneck, as
-  Manchester discovered
-
-## Build Order
-
-### Phase 0: SM (Structure Memory) — BUILD FIRST
-- Self-contained module, testable in isolation
-- Drive with microcontroller (Arduino/RP2040) for testing
-- Defined interface: receive operation request, process, return result
-- Key deliverables:
-  - Banked SRAM with address decoding
-  - Simple operation unit (read/write at minimum, cons/car/cdr stretch goals)
-  - Input interface (receive request packets)
-  - Output interface (send result packets)
-  - Test harness: microcontroller sends requests, validates responses
-
-### Phase 1: CM (Control Module) — single PE
-- Instruction memory (SRAM)
-- Matching store with direct-indexed context slots
-- Bump allocator + throttle + generation counters
-- 8/16-bit ALU
-- Token FIFO (input)
-- Token output formatting
-- Test with microcontroller injecting tokens, verify matching + execution
-
-### Phase 2: CM + SM pair
-- Connect via shared bus with type routing
-- Load a program using microcontroller (external, via type-11 config writes
-  or direct SRAM programming)
-- Execute a dataflow graph that uses structure memory
-- First real program: fibonacci, small FFT, or similar
-
-### Phase 3: Multi-module
-- Second CM, routing network
-- Prove cross-PE token routing works
-- Demonstrate actual parallel execution speedup
-
-### Phase 4: System
-- Expand to 4 CMs + 1-2 SMs
-- I/O controller (type-11 subsystem) with UART
-- Bootstrap via I/O controller reading from flash/serial
-- ISR support (compiler-assigned PE with interrupt token injection from
-  I/O controller)
-- Performance benchmarking vs period-equivalent CPUs
249249-250250-## Open Questions / Next Steps
251251-252252-1. **SM internal design** — CURRENT FOCUS: banking scheme, operation set,
253253- interface protocol
254254-2. **Matching store SRAM addressing** — detailed direct-index + hash fallback
255255- scheme
256256-3. **Context slot count per CM** — 4 bits = 16 slots (12KB SRAM each) vs wider
257257-4. **Data width decision** — 14-bit dyadic payload okay, or bump bus to 36 bits?
258258-5. **Instruction encoding** — operation set, format, how wide
259259-6. **Type-11 packet format** — exact bit allocation for I/O and config subtypes
260260-7. **I/O controller internal design** — state machine, UART bridge, unsolicited
261261- token generation
262262-8. **Compiler / assembler** — hand-written dataflow asm for v0, assembler that
263263- packs token fields
264264-9. **Monadic/dyadic optimisation** — deferred, revisit after v0 matching store
265265- works
266266-267267-## Key Papers in Project
268268-269269-- `gurd1985.pdf` — Manchester Dataflow Machine (matching unit details,
270270- overflow, pipeline)
271271-- `Dataflow_Machine_Architecture.pdf` — Veen survey (comprehensive overview,
272272- matching space analysis)
273273-- `amamiya1982.pdf` — DFM architecture (semi-CAM, structure memory, TTL
274274- prototype)
275275-- `17407_17358.pdf` — DFM evaluation (implementation details, benchmarks,
276276- VLSI projection)
277277-- `efficienthardwarearchitectureforfastipaddresslookup.pdf` — Pao et al.
278278- (binary-trie partitioning, bit-vector parallel search, SRAM pipeline)
279279-- `mclaughlin2005.pdf` — IP lookup survey (comparison of trie vs hash
280280- approaches in hardware)
281281-- `HighperformanceIPlookupcircuitusingDDRSDRAM.pdf` — Yang et al. (hash +
282282- CAM overflow, DDR burst for multi-bank)
283283-- `NonStrict_Execution_in_Parallel_and_Distributed_C.pdf` — non-strict
284284- execution, split-phase memory
285285-- `NATLS219821.pdf` — National Semiconductor 100142 CAM chip (4x4-bit,
286286- reference for discrete CAM scale)
287287-- `MOSES071271.pdf` — Motorola MCM69C233 CAM (32-bit match width, reference
288288- for CAM interface design)
289289-- `yuba1983.pdf` — Yuba et al. (PE pipeline sections, pseudo-result handling,
290290- packet formats)
# Dynamic Dataflow CPU — Architecture Notes

## Project Goals

- Dynamic dataflow CPU achievable with discrete logic (74-series TTL + SRAM)
- Multi-PE design targeting superscalar-equivalent IPC
- "Period-plausible" transistor budget: ~25-35K logic transistors + SRAM chips
  - Comparable to a 68000 or a couple of Z80s in logic complexity
  - Reference builds for physical scale: Fabian Schuiki's superscalar CPU, James Sharman's pipelined CPU
- Must be able to load and execute a binary over serial without a substantial conventional control core
- Incremental build plan: single PE first, expand to multi-PE

## Key Architectural Decisions

### Execution Model

- **Dynamic dataflow** (tagged-token), not static like the Electron E1
- Compiler performs static PE assignment and routing configuration (E1-like)
- Matching store operates dynamically within each PE for concurrent activations
- This is a hybrid: static routing topology, dynamic operand matching

### Influences / Reference Architectures

- **Manchester Dataflow Machine** (Gurd 1985): pipeline structure, matching unit design, overflow handling
- **DFM / Amamiya 1982**: semi-CAM concept, computational locality, function-instance-based addressing
- **Pao et al. (IP lookup)**: subtree bit-vector parallel search via bitwise AND — useful for collision resolution or routing
- **Electron E1**: compile-time spatial mapping, tile-based PEs, control core for bootstrap
- **Yang et al. (DDR SDRAM IP lookup)**: hash + small CAM for collision overflow

### Data Width

- 8 or 16-bit data words within PEs (TBD, likely 16-bit for practicality)
- Internal token packets are wider (~24-32 bits for local, multi-flit for remote)
- Instruction words will be "chunkier" due to tags/destinations

### Token Packet Format (type-tagged, 32-bit)

Four token types, distinguished by 2-bit type field:

```
Type 00 — DYADIC (needs matching, carries generation counter):
[type:2][PE_id:2][ctx_slot:4][gen:2][offset:7][port:1][data:14]

Type 01 — MONADIC (bypass matching, no gen needed):
[type:2][PE_id:2][offset:8][data:20]

Type 10 — STRUCTURE (memory access to SM):
[type:2][SM_id:2][operation:3][address:9][data:16]

Type 11 — EXTENDED (multi-flit, remote/interrupt/control):
[type:2][subtype:4][...first 26 bits of extended payload...]
[...second flit: remaining 32 bits of payload...]
```

Key design rationale:

- Different token types have different overhead requirements — no point paying
  generation counter + context slot tax on monadic ops or memory accesses
- Dyadic tokens carry 14-bit data (sufficient for most intermediates; full 16-bit
  literals can be loaded via monadic "load immediate" feeding into dyadic node)
- Monadic tokens get full 20-bit data payload on same 32-bit bus
- Structure tokens carry full 16-bit data + 9-bit address for SM operations
- Extended type is the escape hatch: remote routing, interrupts, control signals
- Generation counter (2-bit) ONLY on dyadic tokens — prevents ABA problem
  when context slots are reused after deallocation
- 32-bit bus width works with 8-bit-wide SRAM (4 bytes per token)
- If 14-bit dyadic data is too tight, bump to 36-bit bus (9 nibbles, works with
  4-bit-wide SRAM). Decision deferred.
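
A quick way to sanity-check the field widths is to round-trip a Type 00 token in software. A minimal Python sketch (function names are illustrative, not part of any spec):

```python
# Pack/unpack the Type 00 (DYADIC) layout, MSB-first:
# [type:2][PE_id:2][ctx_slot:4][gen:2][offset:7][port:1][data:14] = 32 bits

FIELDS = [("type", 2), ("pe_id", 2), ("ctx_slot", 4), ("gen", 2),
          ("offset", 7), ("port", 1), ("data", 14)]

def pack_dyadic(pe_id, ctx_slot, gen, offset, port, data):
    values = {"type": 0b00, "pe_id": pe_id, "ctx_slot": ctx_slot,
              "gen": gen, "offset": offset, "port": port, "data": data}
    word = 0
    for name, width in FIELDS:
        assert 0 <= values[name] < (1 << width), name  # field overflow check
        word = (word << width) | values[name]
    return word

def unpack_dyadic(word):
    out = {}
    for name, width in reversed(FIELDS):   # pop fields from the LSB end
        out[name] = word & ((1 << width) - 1)
        word >>= width
    return out
```

Round-tripping a token through these two functions confirms the widths sum to exactly 32 bits.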

### Context Slot Lifecycle

- **Allocation**: bump allocator (counter + register) per PE, assigns slot ID on
  function activation. Trivial hardware: counter, adder, gate.
- **Deallocation**: compiler inserts explicit "free" instruction on every exit path
  of a function body. Multiple frees are harmless (idempotent).
- **ABA protection**: 2-bit generation counter per slot, incremented on each
  reallocation. Tokens carry the generation they were created under. Mismatch =
  stale token, discarded. 4 generations before wraparound; stale tokens drain
  in 2-5 cycles, so wraparound collision is effectively impossible.
- **Throttle**: saturating counter tracks active slots per PE. When full, stalls
  new allocations until a free occurs. Hardware cost: counter + comparator + gate
  (~10 TTL chips).
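
The whole lifecycle is small enough to model in a few lines. A behavioural sketch, assuming 16 slots and the 2-bit generation scheme above (class and method names are illustrative):

```python
SLOTS = 16  # assumption: 4-bit ctx_slot field

class ContextAllocator:
    def __init__(self):
        self.next_slot = 0            # bump allocator state
        self.gen = [0] * SLOTS        # 2-bit generation per slot
        self.live = [False] * SLOTS
        self.active = 0               # throttle: count of live slots

    def alloc(self):
        if self.active == SLOTS:
            return None               # throttle full: stall the allocation
        while self.live[self.next_slot]:
            self.next_slot = (self.next_slot + 1) % SLOTS
        slot = self.next_slot
        self.live[slot] = True
        self.active += 1
        self.gen[slot] = (self.gen[slot] + 1) & 0b11  # bump gen on reallocation
        return slot, self.gen[slot]

    def free(self, slot):
        if self.live[slot]:           # repeated frees are idempotent
            self.live[slot] = False
            self.active -= 1

    def accept(self, slot, token_gen):
        # generation mismatch = stale token, discard
        return self.live[slot] and token_gen == self.gen[slot]
```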

### Token Routing Network

- **Hierarchical prefix-based routing**, NOT Manchester-style omega network
- Omega networks have fixed latency regardless of distance (bad — "DRAM from the moon")
- Prefix routing gives variable latency: local = 1 hop, cross-cluster = 2-3 hops
- Average latency depends on program locality, which the compiler can optimise
- Each routing node has a small prefix lookup table, configured at program load time
- Top bits of PE_id select cluster, lower bits select within cluster
- Pao's bitwise AND trick potentially useful for routing decisions or small associative lookups at routing nodes
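
A toy model of the latency asymmetry, assuming a 4-bit PE_id whose top 2 bits select the cluster (field widths and hop counts here are illustrative, not settled):

```python
# Hierarchical prefix routing: compare address prefixes, not full addresses.
# Local delivery is cheap; crossing clusters pays extra hops.

CLUSTER_BITS = 2   # assumption: low 2 bits select the PE within a cluster

def route_hops(src_pe: int, dst_pe: int) -> int:
    if src_pe == dst_pe:
        return 0                                        # self-delivery
    if (src_pe >> CLUSTER_BITS) == (dst_pe >> CLUSTER_BITS):
        return 1                                        # one hop inside cluster
    return 3                                            # up, across, back down
```

Contrast with an omega network, where every pair of PEs pays the same full crossing latency.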

### Module Architecture: CM/SM Split (Amamiya-inspired)

Two module types with distinct roles, connected by potentially separate buses:

**CM (Control Module)** — execution and matching:
- Instruction memory (IM): stores dataflow program (function bodies)
- Operand memory (OM) / matching store: buffers arriving operands, performs matching
- Receives tokens from CN and DN, produces tokens to CN and AN
- Contains the bump allocator, throttle, and generation counter logic

**SM (Structure Memory)** — data storage and structure operations:
- Banked data memory (cells) for arrays, lists, heap data
- Embedded functional units for structure operations (read, write, cons, car, cdr, etc.)
- Receives operation requests via AN, returns results via DN
- Operates asynchronously from CMs — split-phase memory access

**Three interconnects (can share physical bus with type-based routing, or be separate):**
- **CN** (Communication Network): CM-to-CM, carries dyadic/monadic tokens (types 00, 01)
- **AN** (Arbitration Network): CM-to-SM, carries structure operation requests (type 10)
- **DN** (Distribution Network): SM-to-CM, carries structure operation results

Rationale for the split:

- Different traffic types have different width requirements — no need to force
  them all onto one fat bus
- SM can handle memory operations concurrently while CMs continue matching/executing
- SM has its own functional units, so memory operations don't consume CM ALU cycles
- SM banking allows parallel access from multiple CMs, reducing contention
- Aligns with Amamiya 1982 DFM architecture (prototype built in TTL)
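
When the three networks share one physical bus, the 2-bit type field alone steers each packet. A dispatch sketch — note the type 11 handling and the fact that DN traffic is distinguished by direction rather than type are assumptions here:

```python
# Route a 32-bit bus word to a logical network by its type field alone.

def classify(word: int) -> str:
    token_type = (word >> 30) & 0b11      # type field sits in the top 2 bits
    return {0b00: "CN",                   # dyadic token, CM-to-CM
            0b01: "CN",                   # monadic token, CM-to-CM
            0b10: "AN",                   # structure request, CM-to-SM
            0b11: "EXT"}[token_type]      # extended: remote/interrupt/control
```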

### Matching Store Design (highest-risk component)

- **Primary path: direct-indexed context slots** (Amamiya-style semi-CAM)
  - Bump allocator (counter + register) assigns context slot IDs to function activations
  - Context slot ID directly addresses a bank of SRAM
  - Instruction offset within function body used as direct address within that bank
  - Single-cycle matching for the common case — no hashing, no search
- **Fallback path: hash-based matching** for dynamic/overflow cases
  - Multiplicative hashing: `(a * K) >> (w - m)` — simple to implement in hardware
  - Multi-bank (4-8 banks) checked in parallel for collision tolerance (Manchester-style set-associative)
  - Overflow to linked list or dedicated overflow buffer for worst case
- **Compiler-assisted tag assignment**:
  - Static-lifetime values get contiguous, dense tags — sequential readout, no hashing
  - Dynamic activations get allocated tags via bump allocator
  - Potential for hybrid: half of matching store uses precalculated tags, half uses runtime hash
- **Deallocation / reuse**:
  - Explicit "free" instruction on every function exit path (compiler-inserted)
  - Multiple frees are idempotent / harmless
  - Generation counter (2-bit) prevents ABA problem on slot reuse
  - Throttle (saturating counter) prevents matching store overflow
- **Monadic/dyadic optimisation (optional)**:
  - Compiler assigns matching store indices only to dyadic nodes
  - Monadic nodes bypass matching, don't consume matching store cells
  - Requires indirection: matching store cell includes instruction address pointer
  - Cell width increases (~8 bits for instr_addr) but cell count decreases (~60% fewer)
  - local_offset in token = matching store index, NOT instruction address
  - Deferred for v0: simpler to have local_offset = instruction address = matching store address
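
The fallback path is easy to prototype in software before committing to hardware. A sketch with illustrative constants (16-bit tags, 64 sets, Knuth's 16-bit golden-ratio multiplier; the four banks are probed in parallel in hardware, sequentially here):

```python
W, M = 16, 6          # w-bit tags hashed down to m-bit set indices
K = 40503             # odd multiplier, ~2**16 / golden ratio (illustrative)
BANKS = 4

def hash_index(tag: int) -> int:
    """Multiplicative hash: (a * K) >> (w - m), truncated to w bits."""
    return ((tag * K) & 0xFFFF) >> (W - M)

class MatchStore:
    def __init__(self):
        self.banks = [dict() for _ in range(BANKS)]   # index -> (tag, operand)

    def match(self, tag, operand):
        """Return the parked partner operand on a match, else park this one."""
        idx = hash_index(tag)
        for bank in self.banks:                       # parallel in hardware
            if bank.get(idx, (None,))[0] == tag:
                return bank.pop(idx)[1]               # matched: pair fires
        for bank in self.banks:
            if idx not in bank:
                bank[idx] = (tag, operand)            # park first operand
                return None
        raise OverflowError("all banks collide: overflow buffer needed")
```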

### PE Pipeline (5-stage sketch)

```
Stage 1: TOKEN INPUT
 - Receive token from network
 - Buffer in small FIFO (8-deep, 32-bit)
 - ~1K transistors (flip-flops) or use small SRAM

Stage 2: MATCH / BYPASS
 - Direct-index into context slot array (common case, single cycle)
 - Hash path for dynamic/overflow (multi-cycle)
 - Monadic instructions bypass matching entirely
 - Estimated: 2-3K transistors + SRAM

Stage 3: INSTRUCTION FETCH
 - Use local offset to read from PE's instruction SRAM
 - External SRAM chip, so just address generation logic
 - ~200 transistors of logic

Stage 4: EXECUTE
 - 8/16-bit ALU
 - ~500-2000 transistors depending on width and features

Stage 5: TOKEN OUTPUT
 - Form result token with routing prefix
 - Inject into network
 - ~300 transistors
```

- Pipeline registers between stages: ~500 transistors
- Control logic (state machine, handshaking): ~500-1000 transistors

**Per-PE total: ~5-8K transistors of logic + SRAM chips**

### Transistor Budget Estimate (4-PE system)

| Component | Transistors |
|-----------|-------------|
| 4x PE logic | 20-32K |
| Routing network (4 PEs) | 2-3K |
| Bootstrap/loader microsequencer | 1-2K |
| **Total logic** | **~25-35K** |
| SRAM chips (instruction mem, matching stores, token queues) | 8-16 chips |

### Bootstrap / Program Loading

- Hardwired microsequencer (NOT a full CPU)
- Receives serial data, writes to instruction memory and routing tables via dedicated configuration bus
- Config bus is separate from the token network
- ROM-based state machine + UART receiver + bus master interface
- ~20-30 TTL chips estimated
- Issues "start" signal to release token flow
- Alternative: a PE hardwired to run a built-in "loader" program from ROM

### Interrupt Handling

- ISRs are subgraphs in the dataflow program, compiled and mapped to specific PEs like any other code
- Compiler designates which PE(s) handle which interrupts
- Hardware cost: edge detector on I/O pin, gated into token input FIFO of the assigned PE
- Interrupt token injected into FIFO — PE doesn't need special hardware, just sees a token arrive
- Priority: interrupt tokens can jump the FIFO queue (~3 extra chips)
- ISR runs *concurrently* with main program on its reserved PEs — no context switch
- Main program has nodes waiting for "interrupt result" tokens
- Trade-off: reserved ISR PEs sit idle when no interrupts pending
- Scalable: compile-time assignment means you can have multiple ISR PEs for different interrupt sources
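
The queue-jumping behaviour is just a two-class input queue. A minimal model (depth and API are illustrative):

```python
from collections import deque

class TokenFIFO:
    def __init__(self, depth=8):
        self.depth = depth
        self.irq = deque()        # interrupt tokens drain first
        self.normal = deque()

    def push(self, token, interrupt=False):
        if len(self.irq) + len(self.normal) == self.depth:
            return False          # full: back-pressure the network
        (self.irq if interrupt else self.normal).append(token)
        return True

    def pop(self):
        if self.irq:
            return self.irq.popleft()     # interrupts jump the queue
        return self.normal.popleft() if self.normal else None
```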

### IPC / Performance Expectations

- "Superscalar" is the wrong term for dataflow — there's no single instruction stream
- With 4 PEs and single-cycle matching (common case), peak is 4 ops/clock
- Realistic sustained throughput depends on:
  - Network crossing frequency (adds routing latency)
  - Hash path hits vs direct index (matching latency)
  - Available parallelism in the program
  - Network contention (shared bus at v0 scale)
- Parallel workloads (matrix multiply, FFT): near peak
- Sequential/pointer-chasing code: ~0.5-1 ops/clock (still competitive with 6502)
- Key insight: matching store performance is the primary bottleneck, as Manchester discovered

## Build Order

### Phase 0: SM (Structure Memory) — BUILD FIRST

- Self-contained module, testable in isolation
- Drive with microcontroller (Arduino/RP2040) for testing
- Defined interface: receive operation request, process, return result
- Key deliverables:
  - Banked SRAM with address decoding
  - Simple operation unit (read/write at minimum, cons/car/cdr stretch goals)
  - AN input interface (receive request packets)
  - DN output interface (send result packets)
  - Test harness: microcontroller sends requests, validates responses

### Phase 1: CM (Control Module) — single PE

- Instruction memory (SRAM)
- Matching store with direct-indexed context slots
- Bump allocator + throttle + generation counters
- 8/16-bit ALU
- Token FIFO (input)
- Token output formatting
- Test with microcontroller injecting tokens, verify matching + execution

### Phase 2: CM + SM pair

- Connect via shared bus with type routing
- Load a program using microcontroller (external, via type-11 config writes
  or direct SRAM programming)
- Execute a dataflow graph that uses structure memory
- First real program: Fibonacci, small FFT, or similar

### Phase 3: Multi-module

- Second CM, routing network
- Prove cross-PE token routing works
- Demonstrate actual parallel execution speedup

### Phase 4: System

- Expand to 4 CMs + 1-2 SMs
- I/O controller (type-11 subsystem) with UART
- Bootstrap via I/O controller reading from flash/serial
- ISR support (compiler-assigned PE with interrupt token injection from
  I/O controller)
- Performance benchmarking vs period-equivalent CPUs

## Open Questions / Next Steps

1. **SM internal design** — CURRENT FOCUS: banking scheme, operation set,
   interface protocol
2. **Matching store SRAM addressing** — detailed direct-index + hash fallback
   scheme
3. **Context slot count per CM** — 4 bits = 16 slots (12KB SRAM each) vs wider
4. **Data width decision** — 14-bit dyadic payload okay, or bump bus to 36 bits?
5. **Instruction encoding** — operation set, format, how wide
6. **Type-11 packet format** — exact bit allocation for I/O and config subtypes
7. **I/O controller internal design** — state machine, UART bridge, unsolicited
   token generation
8. **Compiler / assembler** — hand-written dataflow asm for v0, assembler that
   packs token fields
9. **Monadic/dyadic optimisation** — deferred, revisit after v0 matching store
   works

## Key Papers in Project

- `gurd1985.pdf` — Manchester Dataflow Machine (matching unit details, overflow, pipeline)
- `Dataflow_Machine_Architecture.pdf` — Veen survey (comprehensive overview, matching space analysis)
- `amamiya1982.pdf` — DFM architecture (semi-CAM, structure memory, TTL prototype)
- `17407_17358.pdf` — DFM evaluation (implementation details, benchmarks, VLSI projection)
- `efficienthardwarearchitectureforfastipaddresslookup.pdf` — Pao et al. (binary-trie partitioning, bit-vector parallel search, SRAM pipeline)
- `mclaughlin2005.pdf` — IP lookup survey (comparison of trie vs hash approaches in hardware)
- `HighperformanceIPlookupcircuitusingDDRSDRAM.pdf` — Yang et al. (hash + CAM overflow, DDR burst for multi-bank)
- `NonStrict_Execution_in_Parallel_and_Distributed_C.pdf` — non-strict execution, split-phase memory
- `NATLS219821.pdf` — National Semiconductor 100142 CAM chip (4x4-bit, reference for discrete CAM scale)
- `MOSES071271.pdf` — Motorola MCM69C233 CAM (32-bit match width, reference for CAM interface design)
- `yuba1983.pdf` — Yuba et al. (PE pipeline sections, pseudo-result handling, packet formats)

## SM (Structure Memory) — Detailed Design

### Interface Protocol

Stateless request handling: the request token carries its own return routing info
in the bits that are unused by that operation type. SM never maintains pending-request
state — result packets are self-addressed.

```
READ request (data field repurposed for return routing):
[type:2][SM_id:2][op:3][address:9][ret_CM:2][ret_offset:8][ret_ctx:4][ret_port:1][pad:1]

WRITE request (data field carries write data, no response needed):
[type:2][SM_id:2][op:3][address:9][data:16]

READ_INC / READ_DEC (same as READ format — return routing in data field):
[type:2][SM_id:2][op:3][address:9][ret_CM:2][ret_offset:8][ret_ctx:4][ret_port:1][pad:1]

CAS — compare-and-swap (two-flit operation):
Flit 1: [type:2][SM_id:2][op:3][address:9][expected_value:16]
Flit 2: [new_value:16][ret_CM:2][ret_offset:8][ret_ctx:4][ret_port:1][pad:1]
```

Result packet on DN (SM -> CM): repackaged as a dyadic or monadic token destined
for [ret_CM, ret_offset, ret_ctx, ret_port] with the fetched data as payload.
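
The READ request packing is mechanical once the field order is fixed. A sketch (opcode 000 for READ per the operation set below; the function name is illustrative):

```python
# Pack a READ request, MSB-first per the format above:
# [type:2][SM_id:2][op:3][address:9][ret_CM:2][ret_offset:8][ret_ctx:4][ret_port:1][pad:1]

def pack_read(sm_id, address, ret_cm, ret_offset, ret_ctx, ret_port):
    word = 0b10                            # type 10: STRUCTURE
    for value, width in ((sm_id, 2), (0b000, 3), (address, 9),
                         (ret_cm, 2), (ret_offset, 8), (ret_ctx, 4),
                         (ret_port, 1), (0, 1)):   # final bit is padding
        assert 0 <= value < (1 << width)
        word = (word << width) | value
    return word
```

The return-route fields occupy exactly the 16 bits a WRITE uses for data, which is what lets the SM stay stateless.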

### Operation Set (3-bit opcode, 8 slots)

```
000: READ     — read address, return data via DN
001: WRITE    — write data to address (no DN response)
010: READ_INC — atomic fetch-and-add(+1), return old value (= atomic ptr increment)
011: READ_DEC — atomic fetch-and-add(-1), return old value (= refcount decrement)
100: CAS      — compare-and-swap (two-flit), return old value + success bit
101: ALLOC    — (future) allocate N cells, return base address
110: FREE     — (future) mark cells as available
111: RESERVED
```

READ_INC / READ_DEC are fetch-and-add primitives — they give atomic pointer
operations and reference counting without dedicated refcount hardware. CM checks
returned value for zero (refcount exhausted) using its normal ALU.
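
A behavioural sketch of the read-modify-write semantics — every op returns the *old* value, with 16-bit wraparound (ALLOC/FREE omitted; class name illustrative):

```python
class SMBank:
    def __init__(self, cells=512):
        self.mem = [0] * cells            # 9-bit address space, 16-bit cells

    def read_inc(self, addr):
        old = self.mem[addr]
        self.mem[addr] = (old + 1) & 0xFFFF   # fetch-and-add(+1)
        return old

    def read_dec(self, addr):
        old = self.mem[addr]
        self.mem[addr] = (old - 1) & 0xFFFF   # fetch-and-add(-1)
        return old

    def cas(self, addr, expected, new):
        old = self.mem[addr]
        if old == expected:
            self.mem[addr] = new & 0xFFFF
        return old, old == expected           # old value + success bit
```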

### Hardware Architecture

```
AN Input Interface                         DN Output Interface
 (receive request)                           (send result)
        |                                         ^
        v                                         |
  [Request FIFO]                            [Result FIFO]
        |                                         ^
        v                                         |
  [Op Decoder]----+                       [Result Formatter]
        |         |                               ^
        v         v                               |
  [Addr Decode] [ALU for inc/dec/cas]      [Bank Read Data]
        |         |                               ^
        v         v                               |
  [SRAM Bank 0] [SRAM Bank 1]  ...         [SRAM Bank N]
```

- Banking: start with 2 banks (1 address bit selects bank) for v0
- 9-bit address = 512 cells per SM = 1KB at 16-bit data width
- Each bank is one SRAM chip with room to spare
- ALU is minimal: increment, decrement, compare. Not a full ALU.
- Op decoder determines: is this read/write/RMW? one-flit or two-flit?
  does it need a DN response? how to pack the result?
- Result formatter extracts return routing from request, constructs DN token

### V0 Test Plan

- Drive AN input with microcontroller (RP2040 / Arduino)
- Microcontroller formats 32-bit request packets, clocks into request FIFO
- Read 32-bit result packets from DN output FIFO
- Test suite: sequential read/write, random access, read_inc sequences,
  bank contention (same bank back-to-back), boundary conditions

### 4x4-bit CAM Chips (National Semiconductor 100142)

- Available in DIP, period-appropriate
- 4 words x 4 bits each — very small but potentially useful for:
  - Small routing tables at network nodes (4-8 entries)
  - Context slot allocation lookup (which slots are free)
  - NOT practical for bulk matching store (too few entries per chip)
- Datasheet scan in project: NATLS219821.pdf
292292-- Trade-off: reserved ISR PEs sit idle when no interrupts pending
293293-- Scalable: compile-time assignment means you can have multiple ISR PEs for different interrupt sources
294294-295295-### IPC / Performance Expectations
296296-- "Superscalar" is the wrong term for dataflow — there's no single instruction stream
297297-- With 4 PEs and single-cycle matching (common case), peak is 4 ops/clock
298298-- Realistic sustained throughput depends on:
299299- - Network crossing frequency (adds routing latency)
300300- - Hash path hits vs direct index (matching latency)
301301- - Available parallelism in the program
302302-- Parallel workloads (matrix multiply, FFT): near peak
303303-- Sequential/pointer-chasing code: ~0.5-1 ops/clock (still competitive with 6502)
304304-- Key insight: matching store performance is the primary bottleneck, as Manchester discovered
305305-306306-## Build Order
307307-308308-### Phase 0: SM (Structure Memory) — BUILD FIRST
309309-- Self-contained module, testable in isolation
310310-- Drive with microcontroller (Arduino/RP2040) for testing
311311-- Defined interface: receive operation request, process, return result
312312-- Key deliverables:
313313- - Banked SRAM with address decoding
314314- - Simple operation unit (read/write at minimum, cons/car/cdr stretch goals)
315315- - AN input interface (receive request packets)
316316- - DN output interface (send result packets)
317317- - Test harness: microcontroller sends requests, validates responses
318318-319319-### Phase 1: CM (Control Module) — single PE
320320-- Instruction memory (SRAM)
321321-- Matching store with direct-indexed context slots
322322-- Bump allocator + throttle + generation counters
323323-- 8/16-bit ALU
324324-- Token FIFO (input)
325325-- Token output formatting
326326-- Test with microcontroller injecting tokens, verify matching + execution
327327-328328-### Phase 2: CM + SM pair
329329-- Connect via AN/DN (or shared bus with type routing)
330330-- Load a program over serial via bootstrap microsequencer
331331-- Execute a dataflow graph that uses structure memory
332332-- First real program: fibonacci, small FFT, or similar
333333-334334-### Phase 3: Multi-module
335335-- Second CM, routing network
336336-- Prove cross-PE token routing works
337337-- Demonstrate actual parallel execution speedup
338338-339339-### Phase 4: System
340340-- Expand to 4 CMs + 1-2 SMs
341341-- Full bootstrap/loader microsequencer (serial load, configure routing, start)
342342-- ISR support (compiler-assigned PE with interrupt token injection)
343343-- Performance benchmarking vs period-equivalent CPUs
344344-345345-## Open Questions / Next Steps
346346-347347-1. **SM internal design** — CURRENT FOCUS: banking scheme, operation set, interface protocol
348348-2. **Matching store SRAM addressing** — detailed direct-index + hash fallback scheme
349349-3. **Context slot count per CM** — 4 bits = 16 slots (12KB SRAM each) vs wider
350350-4. **Data width decision** — 14-bit dyadic payload okay, or bump bus to 36 bits?
351351-5. **Instruction encoding** — operation set, format, how wide
352352-6. **Routing network topology** — exact interconnect for multi-CM/SM
353353-7. **Compiler / assembler** — hand-written dataflow asm for v0, assembler that packs token fields
354354-8. **Monadic/dyadic optimisation** — deferred, revisit after v0 matching store works
355355-356356-## Key Papers in Project
357357-358358-- `gurd1985.pdf` — Manchester Dataflow Machine (matching unit details, overflow, pipeline)
359359-- `Dataflow_Machine_Architecture.pdf` — Veen survey (comprehensive overview, matching space analysis)
360360-- `amamiya1982.pdf` — DFM architecture (semi-CAM, structure memory, TTL prototype)
361361-- `17407_17358.pdf` — DFM evaluation (implementation details, benchmarks, VLSI projection)
362362-- `efficienthardwarearchitectureforfastipaddresslookup.pdf` — Pao et al. (binary-trie partitioning, bit-vector parallel search, SRAM pipeline)
363363-- `mclaughlin2005.pdf` — IP lookup survey (comparison of trie vs hash approaches in hardware)
364364-- `HighperformanceIPlookupcircuitusingDDRSDRAM.pdf` — Yang et al. (hash + CAM overflow, DDR burst for multi-bank)
365365-- `NonStrict_Execution_in_Parallel_and_Distributed_C.pdf` — non-strict execution, split-phase memory
366366-- `NATLS219821.pdf` — National Semiconductor 100142 CAM chip (4x4-bit, reference for discrete CAM scale)
367367-- `MOSES071271.pdf` — (in project, not yet examined)
368368-- `yuba1983.pdf` — (in project, not yet examined)
···11-# Dynamic Dataflow CPU — Architecture Notes
22-33-## Project Goals
44-55-- Dynamic dataflow CPU achievable with discrete logic (74-series TTL + SRAM)
66-- Multi-PE design targeting superscalar-equivalent IPC
77-- "Period-plausible" transistor budget: ~25-35K logic transistors + SRAM chips
88- - Comparable to a 68000 or a couple of Z80s in logic complexity
99- - Reference builds for physical scale: Fabian Schuiki's superscalar CPU, James Sharman's pipelined CPU
1010-- Must be able to load and execute a binary over serial without a substantial conventional control core
1111-- Incremental build plan: single PE first, expand to multi-PE
1212-1313-## Key Architectural Decisions
1414-1515-### Execution Model
1616-- **Dynamic dataflow** (tagged-token), not static like the Electron E1
1717-- Compiler performs static PE assignment and routing configuration (E1-like)
1818-- Matching store operates dynamically within each PE for concurrent activations
1919-- This is a hybrid: static routing topology, dynamic operand matching
2020-2121-### Influences / Reference Architectures
2222-- **Manchester Dataflow Machine** (Gurd 1985): pipeline structure, matching unit design, overflow handling
2323-- **DFM / Amamiya 1982**: semi-CAM concept, computational locality, function-instance-based addressing
2424-- **Pao et al. (IP lookup)**: subtree bit-vector parallel search via bitwise AND — useful for collision resolution or routing
2525-- **Electron E1**: compile-time spatial mapping, tile-based PEs, control core for bootstrap
2626-- **Yang et al. (DDR SDRAM IP lookup)**: hash + small CAM for collision overflow
2727-2828-### Data Width
2929-- 8 or 16-bit data words within PEs (TBD, likely 16-bit for practicality)
3030-- Internal token packets are wider (~24-32 bits for local, multi-flit for remote)
3131-- Instruction words will be "chunkier" due to tags/destinations
3232-3333-### Token Packet Format (working sketch)
3434-3535-```
3636-LOCAL TOKEN (single flit):
3737-[PE_id: 3b][local_offset: 8b][context_slot: 4b][port: 1b][data: 8-16b]
3838- = 24-32 bits total
3939-4040-EXTENDED TOKEN (two flits, for cross-cluster / off-machine):
4141-Flit 1: [RESERVED_PE_id: 3b (all 1s)][extended header bits]
4242-Flit 2: [full remote destination + data]
4343-```
4444-4545-- PE_id field with reserved value (all-1s) triggers extended addressing mode
4646-- Remote tokens travel as two flits on the network — no bus locking needed
4747-- Routing nodes optimised for the single-flit common case
4848-- Composable: third flit could carry inter-machine routing
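The single-flit layout above packs into a plain integer. A sketch with 16-bit data (field order, with PE_id in the top bits, is an assumption for illustration; widths follow the sketch, 3 + 8 + 4 + 1 + 16 = 32):

```python
# Pack/unpack for the 32-bit local-token sketch. Field placement is
# illustrative; only the widths come from the notes.
def pack_local(pe_id, offset, ctx, port, data):
    assert pe_id < 8 and offset < 256 and ctx < 16 and port < 2 and data < 65536
    return (pe_id << 29) | (offset << 21) | (ctx << 17) | (port << 16) | data

def unpack_local(word):
    return ((word >> 29) & 0x7,    # PE_id
            (word >> 21) & 0xFF,   # local_offset
            (word >> 17) & 0xF,    # context_slot
            (word >> 16) & 0x1,    # port
            word & 0xFFFF)         # data

EXTENDED_PE = 0b111  # all-1s PE_id: switch to two-flit extended addressing
```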
4949-5050-### Token Routing Network
5151-- **Hierarchical prefix-based routing**, NOT Manchester-style omega network
5252-- Omega networks have fixed latency regardless of distance (bad — "DRAM from the moon")
5353-- Prefix routing gives variable latency: local = 1 hop, cross-cluster = 2-3 hops
5454-- Average latency depends on program locality, which the compiler can optimise
5555-- Each routing node has a small prefix lookup table, configured at program load time
5656-- Top bits of PE_id select cluster, lower bits select within cluster
5757-- Pao's bitwise AND trick potentially useful for routing decisions or small associative lookups at routing nodes
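A minimal model of the per-hop decision, assuming a 3-bit PE_id whose top bit selects the cluster (the exact field split is still open; the port names are invented for the sketch):

```python
# Prefix-routed hop decision: deliver locally if the destination's
# cluster prefix matches this node, otherwise forward toward the uplink.
LOCAL_PORT = "local"

def route(node_cluster, dest_pe_id, uplink="up"):
    cluster = dest_pe_id >> 2              # top bit of PE_id = cluster
    if cluster == node_cluster:
        return (LOCAL_PORT, dest_pe_id & 0b11)  # low bits select PE in cluster
    return (uplink, None)                  # cross-cluster: one more hop
```

This is what gives the variable latency described above: a match on the prefix terminates routing immediately, so local traffic never pays the cross-cluster cost.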
5858-5959-### Matching Store Design (highest-risk component)
6060-- **Primary path: direct-indexed context slots** (Amamiya-style semi-CAM)
6161- - Bump allocator (counter + register) assigns context slot IDs to function activations
6262- - Context slot ID directly addresses a bank of SRAM
6363- - Instruction offset within function body used as direct address within that bank
6464- - Single-cycle matching for the common case — no hashing, no search
6565-- **Fallback path: hash-based matching** for dynamic/overflow cases
6666- - Multiplicative hashing: `(a * K) >> (w - m)` — simple to implement in hardware
6767- - Multi-bank (4-8 banks) checked in parallel for collision tolerance (Manchester-style set-associative)
6868- - Overflow to linked list or dedicated overflow buffer for worst case
6969-- **Compiler-assisted tag assignment**:
7070- - Static-lifetime values get contiguous, dense tags — sequential readout, no hashing
7171- - Dynamic activations get allocated tags via bump allocator
7272- - Potential for hybrid: half of matching store uses precalculated tags, half uses runtime hash
7373-- **Deallocation / reuse**:
7474- - Bump allocator handles allocation trivially
7575- - Deallocation is the hard part
7676- - Throttle mechanism (limit concurrent activations) enables context slot reuse
7777- - For statically-verifiable lifetimes, compiler manages reuse directly
7878- - For dynamic lifetimes, track via secondary lookup (TBD)
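The fallback-path hash, `(a * K) >> (w - m)`, is cheap to model. The constants below are illustrative assumptions (16-bit tags, 64 sets, a golden-ratio-style odd multiplier), not decided values:

```python
# Multiplicative hashing for the matching-store fallback path.
W = 16             # tag width in bits (assumed)
M = 6              # index width: 2**6 = 64 sets (assumed)
K = 40503          # odd multiplier, roughly 2**16 / golden ratio

def ms_index(tag):
    # Keep the low W bits of the product, then take the top M of those,
    # exactly as a W-bit hardware multiplier would.
    return ((tag * K) & ((1 << W) - 1)) >> (W - M)
```

In hardware this is one multiply and a wire selection of the top M product bits; the multi-bank parallel check then probes the 4-8 banks at this index simultaneously.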
7979-8080-### PE Pipeline (5-stage sketch)
8181-8282-```
8383-Stage 1: TOKEN INPUT
8484- - Receive token from network
8585- - Buffer in small FIFO (8-deep, 32-bit)
8686- - ~1K transistors (flip-flops) or use small SRAM
8787-8888-Stage 2: MATCH / BYPASS
8989- - Direct-index into context slot array (common case, single cycle)
9090- - Hash path for dynamic/overflow (multi-cycle)
9191- - Monadic instructions bypass matching entirely
9292- - Estimated: 2-3K transistors + SRAM
9393-9494-Stage 3: INSTRUCTION FETCH
9595- - Use local offset to read from PE's instruction SRAM
9696- - External SRAM chip, so just address generation logic
9797- - ~200 transistors of logic
9898-9999-Stage 4: EXECUTE
100100- - 8/16-bit ALU
101101- - ~500-2000 transistors depending on width and features
102102-103103-Stage 5: TOKEN OUTPUT
104104- - Form result token with routing prefix
105105- - Inject into network
106106- - ~300 transistors
107107-```
108108-109109-Pipeline registers between stages: ~500 transistors
110110-Control logic (state machine, handshaking): ~500-1000 transistors
111111-112112-**Per-PE total: ~5-8K transistors of logic + SRAM chips**
113113-114114-### Transistor Budget Estimate (4-PE system)
115115-116116-| Component | Transistors |
117117-|-----------|------------|
118118-| 4x PE logic | 20-32K |
119119-| Routing network (4 PEs) | 2-3K |
120120-| Bootstrap/loader microsequencer | 1-2K |
121121-| **Total logic** | **~25-35K** |
122122-| SRAM chips (instruction mem, matching stores, token queues) | 8-16 chips |
123123-124124-### Bootstrap / Program Loading
125125-- Hardwired microsequencer (NOT a full CPU)
126126-- Receives serial data, writes to instruction memory and routing tables via dedicated configuration bus
127127-- Config bus is separate from the token network
128128-- ROM-based state machine + UART receiver + bus master interface
129129-- ~20-30 TTL chips estimated
130130-- Issues "start" signal to release token flow
131131-- Alternative: a PE hardwired to run a built-in "loader" program from ROM
132132-133133-### Interrupt Handling
134134-- ISRs are subgraphs in the dataflow program, compiled and mapped to specific PEs like any other code
135135-- Compiler designates which PE(s) handle which interrupts
136136-- Hardware cost: edge detector on I/O pin, gated into token input FIFO of the assigned PE
137137-- Interrupt token injected into FIFO — PE doesn't need special hardware, just sees a token arrive
138138-- Priority: interrupt tokens can jump the FIFO queue (~3 extra chips)
139139-- ISR runs *concurrently* with main program on its reserved PEs — no context switch
140140-- Main program has nodes waiting for "interrupt result" tokens
141141-- Trade-off: reserved ISR PEs sit idle when no interrupts pending
142142-- Scalable: compile-time assignment means you can have multiple ISR PEs for different interrupt sources
143143-144144-### IPC / Performance Expectations
145145-- "Superscalar" is the wrong term for dataflow — there's no single instruction stream
146146-- With 4 PEs and single-cycle matching (common case), peak is 4 ops/clock
147147-- Realistic sustained throughput depends on:
148148- - Network crossing frequency (adds routing latency)
149149- - Hash path hits vs direct index (matching latency)
150150- - Available parallelism in the program
151151-- Parallel workloads (matrix multiply, FFT): near peak
152152-- Sequential/pointer-chasing code: ~0.5-1 ops/clock (still competitive with 6502)
153153-- Key insight: matching store performance is the primary bottleneck, as Manchester discovered
154154-155155-## Open Questions / Next Steps
156156-157157-1. **Matching store SRAM addressing scheme** — detailed design of direct-indexed + hash fallback, including bump allocator hardware
158158-2. **Context slot sizing** — how many concurrent contexts per PE? determines SRAM requirements
159159-3. **Instruction encoding** — what operations, what format, how wide
160160-4. **Routing network topology** — exact interconnect for 4-8 PEs
161161-5. **Compiler / assembler** — even a basic assembler for hand-written dataflow assembly
162162-6. **Throttle mechanism** — how to limit concurrent activations to prevent matching store overflow
163163-7. **Deallocation** — hardware mechanism for freeing context slots when activations complete
164164-8. **v0 milestone** — single PE + loader, load and execute fibonacci or similar over serial
165165-166166-## Key Papers in Project
167167-168168-- `gurd1985.pdf` — Manchester Dataflow Machine (matching unit details, overflow, pipeline)
169169-- `Dataflow_Machine_Architecture.pdf` — Veen survey (comprehensive overview, matching space analysis)
170170-- `amamiya1982.pdf` — DFM architecture (semi-CAM, structure memory, TTL prototype)
171171-- `17407_17358.pdf` — DFM evaluation (implementation details, benchmarks, VLSI projection)
172172-- `efficienthardwarearchitectureforfastipaddresslookup.pdf` — Pao et al. (binary-trie partitioning, bit-vector parallel search, SRAM pipeline)
173173-- `mclaughlin2005.pdf` — IP lookup survey (comparison of trie vs hash approaches in hardware)
174174-- `HighperformanceIPlookupcircuitusingDDRSDRAM.pdf` — Yang et al. (hash + CAM overflow, DDR burst for multi-bank)
175175-- `NonStrict_Execution_in_Parallel_and_Distributed_C.pdf` — non-strict execution, split-phase memory
176176-- `NATLS219821.pdf` — National Semiconductor 100142 CAM chip (4x4-bit, reference for discrete CAM scale)
177177-- `MOSES071271.pdf` — (in project, not yet examined)
178178-- `yuba1983.pdf` — (in project, not yet examined)
design-notes/versions/design-alternatives.md
···11-# Dynamic Dataflow CPU — Design Alternatives & Roads Not Travelled
22-33-Companion document to the architecture docs. Captures rejected, deferred,
44-and superseded approaches, their advantages, disadvantages, and why we went
55-the way we did.
66-77-Updated to reflect decisions from ongoing design discussions.
88-99----
1010-1111-## 1. Routing Network Topology
1212-1313-### Chosen: Hierarchical Prefix-Based Routing (target architecture)
1414-### v0 Implementation: Shared Bus with Type-Based Routing
1515-1616-For 4 PEs + 1-2 SMs + I/O controller, a shared pipelined bus with latches
1717-is sufficient. The type field in the packet header is the primary routing
1818-discriminator. Prefix routing is the target for scaling but doesn't need
1919-to be built until Phase 3+.
2020-2121-### Alternative A: Manchester-Style Omega / Sorting Network
2222-- **How it works**: log2(n) stages of 2x2 routing elements. Every token
2323- traverses all stages. Destination bits are consumed one per stage.
2424-- **Advantages**:
2525- - Maximally general: any-to-any routing in fixed time
2626- - No routing tables to configure — topology IS the routing algorithm
2727- - Well-understood, proven in Manchester hardware
2828-- **Disadvantages**:
2929- - Fixed latency regardless of distance (the "DRAM from the moon" problem)
3030- - Latency grows with PE count even for local traffic
3131- - All tokens pay full traversal cost — devastating for locality-heavy programs
3232- - Hardware grows as n * log2(n) routing elements
3333-- **Why rejected**: our design explicitly exploits compiler-assigned locality.
3434- Paying full network traversal for a token going to the PE next door is
3535- wasteful. Hierarchical routing makes the common case fast.
3636-3737-### Alternative B: Crossbar
3838-- **How it works**: full n*n switch. Any source to any destination in one cycle.
3939-- **Advantages**:
4040- - Minimum latency: everything is 1 hop
4141- - Simple conceptually
4242-- **Disadvantages**:
4343- - Hardware grows as n^2. 4 PEs = 16 crosspoints, fine. 8 PEs = 64.
4444- - Each crosspoint needs a mux + arbiter; it gets expensive fast.
4545- - Contention handling needs buffering or stalling
4646-- **Why rejected**: doesn't scale. Fine for 4 PEs, but we want the architecture
4747- to extend beyond that. Could revisit as a LOCAL interconnect within a
4848- cluster, with hierarchical routing between clusters.
4949-5050-### Alternative C: Ring Bus
5151-- **How it works**: tokens travel around a ring, each node inspects and either
5252- consumes or passes through.
5353-- **Advantages**:
5454- - Dead simple hardware: each node is a register + comparator + mux
5555- - Trivially extensible: add a node, extend the ring
5656-- **Disadvantages**:
5757- - Worst-case latency is n-1 hops
5858- - Average latency grows linearly with PE count
5959- - Bandwidth shared: total ring bandwidth is fixed regardless of PE count
6060-- **Status**: not rejected, just unnecessary at v0 scale. **worth
6161- reconsidering** as an intermediate step between shared bus and full
6262- prefix routing if the system grows to 8-16 PEs.
6363-6464-### Alternative D: Shared Bus (chosen for v0)
6565-- **Advantages**:
6666- - Absolute minimum hardware
6767- - Trivially simple
6868- - With pipelined latches, multiple packets in flight
6969-- **Disadvantages**:
7070- - Bandwidth limited
7171- - Doesn't scale past ~4-8 nodes
7272-- **Status**: v0 physical implementation. The token format is designed for the
7373- prefix-routed future, but the physical wires are a shared bus. The type field
7474- provides a natural decomposition path — CN/AN/DN can be split onto
7575- separate physical paths when contention shows up.
7676-7777----
7878-7979-## 2. Token Format
8080-8181-### Chosen: Type-Tagged 32-bit Tokens (4 types)
8282-- 2-bit type field as primary routing discriminator
8383-- Type 11 subdivided by 2-bit subtype for I/O, config, and future system
8484- management traffic
8585-- See `architecture-overview.md` for full format specification
8686-8787-No changes to alternatives from previous version. The type-11 subtype
8888-scheme was added to handle I/O and config writes without consuming PE
8989-or SM address space.
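The dispatch the routing hardware performs can be sketched as below. Bit positions (type in the top two bits of a 32-bit token, subtype in the next two) are assumptions for illustration; see `architecture-overview.md` for the real layout:

```python
# Classify a token by its 2-bit type field; type 11 is subdivided by a
# 2-bit subtype. Only subtype 01 = config is fixed by the notes; other
# labels here are placeholders.
def classify(token):
    t = (token >> 30) & 0b11
    if t in (0b00, 0b01):
        return "CN"                    # PE-bound compute tokens
    if t == 0b10:
        return "SM"                    # AN/DN structure-memory traffic
    sub = (token >> 28) & 0b11         # type 11: system channel
    return {0b01: "config"}.get(sub, "system")
```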
9090-9191-### Alternative A: Fixed-Field Flat Token
9292-- **Why rejected**: wastes bits on monadic and structure tokens. The type-tagged
9393- approach reclaims 6+ bits. Decoding cost is trivial.
9494-9595-### Alternative B: 36-bit Bus
9696-- **Status**: still the escape hatch if 14-bit dyadic data is too limiting.
9797- Decision deferred.
9898-9999-### Alternative C: Variable-Width Tokens
100100-- **Why rejected**: complexity cost too high, slows the common case. Multi-flit
101101- is used only for type-11 extended operations (rare).
102102-103103----
104104-105105-## 3. Matching Store Architecture
106106-107107-### Chosen: Direct-Indexed Context Slots (Amamiya semi-CAM) + Hash Fallback
108108-No changes. See `pe-design.md` for the detailed design.
109109-110110-### Alternative A: Pure Hashing (Manchester-Style)
111111-- **Why rejected**: <20% memory utilisation, 16 parallel banks per PE,
112112- overflow subsystem. Too much hardware for the benefit. The semi-CAM
113113- approach gives single-cycle matching for the common case.
114114-115115-### Alternative B: Full CAM
116116-- **Why rejected**: discrete CAM chips are tiny (4x4 bits) or expensive.
117117- Can't practically build a matching store out of them at the needed scale.
118118-119119-### Alternative C: Software Matching (in the PE pipeline)
120120-- **Why rejected**: turns every dyadic operation into a multi-cycle search.
121121- Destroys throughput. The whole point is hardware matching.
122122-123123-### FPGA Prototyping (recommended)
124124-Before committing to a TTL matching store, prototype in a small FPGA
125125-(iCE40, etc.). Validate the addressing scheme, test with real token
126126-streams, and measure collision rates. This doesn't compromise the "discrete logic"
127127-goal — it's a prototyping step. **Strongly recommended** before building
128128-boards.
129129-130130----
131131-132132-## 4. I/O Architecture
133133-134134-### Chosen: I/O Controller on Type-11 System Channel
136136-**UPDATED**: the previous design considered SM-mapped I/O (I/O registers at
137137-specific SM addresses). This has been superseded by a dedicated I/O
138138-controller that lives on the type-11 system channel.
139139-140140-See `io-and-bootstrap.md` for full design.
141141-142142-### Alternative A: I/O as SM Bank (superseded)
143143-- **How it works**: an SM bank that isn't memory — it's a UART/SPI/etc.
144144- behind the same AN/DN interface. CM issues type-10 READ/WRITE to
145145- specific SM addresses and gets I/O responses.
146146-- **Advantages**:
147147- - Zero new architecture. CMs already talk to SM banks.
148148- - Simple mental model (memory-mapped I/O, just like everyone else)
149149-- **Disadvantages**:
150150- - Burns an SM_id slot (2-bit field, only 4 banks). real cost.
151151- - Semantic mismatch: SM operations are stateless request/response,
152152- but I/O often needs unsolicited events (UART RX)
153153- - An SM bank can't spontaneously generate tokens — it can only respond
154154- to requests. This means no interrupt equivalent, only polling.
155155- - Shoehorning I/O config (baud rate, etc.) into SM address space is
156156- awkward
157157-- **Why superseded**: the type-11 I/O controller approach gives a free
158158- packet format (not constrained by SM token layout), supports
159159- unsolicited token generation (dataflow-native interrupts), and doesn't
160160- consume SM address space. Strictly better.
161161-162162-### Alternative B: I/O as Dedicated PE
163163-- **How it works**: a "PE" that isn't a real PE — it's an I/O controller
164164- with a CN network interface. receives type-00/01 tokens, interprets
165165- them as I/O commands.
166166-- **Advantages**:
167167- - I/O operations are function calls in the dataflow graph
168168- - Can generate unsolicited tokens (like the type-11 approach)
169169-- **Disadvantages**:
170170- - Consumes a PE address slot (2-bit PE_id, only 4 slots)
171171- - Token format is constrained by CM token layout (context, offset,
172172- port fields repurposed for I/O semantics — awkward fit)
173173-- **Why superseded**: type-11 approach gives the same benefits (network
174174- participant, unsolicited token generation) without consuming a PE
175175- address slot, and with a completely free packet format for I/O.
176176-177177-### Alternative C: Polling via SM
178178-- **How it works**: a PE periodically issues SM reads to check I/O status.
179179- no hardware interrupt mechanism at all.
180180-- **Advantages**:
181181- - Zero additional hardware
182182- - Entirely in the dataflow paradigm (it's just a program)
183183- - Deterministic timing
184184-- **Disadvantages**:
185185- - Latency = polling interval (potentially very high)
186186- - Wastes PE cycles on polling when no event pending
187187-- **Status**: rejected as general-purpose I/O. but **could coexist** with
188188- the type-11 I/O controller for very low-priority status checks.
189189-190190----
191191-192192-## 5. Separate Communication Networks (CN/AN/DN)
193193-194194-### Chosen: Shared Physical Bus for v0, Logically Separate
195195-196196-**UPDATED**: the Amamiya architecture has physically separate CN, AN, and
197197-DN. We're sharing a physical bus for v0 but maintaining the logical
198198-separation via the type field.
199199-200200-The type field provides a clean decomposition path:
201201-- When SM access contention becomes measurable, split type-10 traffic
202202- onto a dedicated AN/DN bus
203203-- CN (types 00/01) and system (type 11) stay on the original bus
204204-- Further splits as needed
206206-This is a topology change, not a protocol change. No module interfaces
207207-change when the bus is split.
208208-209209-### Alternative: Physically Separate from Day One
210210-- **Advantages**:
211211- - No contention between traffic classes
212212- - Closer to Amamiya's proven architecture
213213-- **Disadvantages**:
214214- - 3x the bus wiring, routing logic, and board area for v0
215215- - At 4 PEs, contention is unlikely to be the bottleneck
216216- - Premature optimisation
217217-- **Why deferred**: build it when the measurements say you need it.
218218-219219----
220220-221221-## 6. Interrupt Handling
222222-223223-### Chosen: Unsolicited Token Injection from I/O Controller
225225-**UPDATED**: the previous design had interrupt tokens injected directly into
226226-PE input FIFOs via hardware edge detectors on I/O pins. This is superseded
227227-by the I/O controller model where the controller generates and injects
228228-tokens onto the network.
229229-230230-Advantages over the previous approach:
231231-- No per-PE interrupt hardware needed
232232-- I/O controller centralises all external event handling
233233-- Destination PE is configurable, not hardwired
234234-- Same mechanism works for all I/O devices
235235-236236-See `io-and-bootstrap.md` for the unsolicited token generation model.
237237-238238-### Previous Approach: Direct Interrupt Token Injection
239239-- Edge detector on I/O pin, gated into specific PE's input FIFO
240240-- Compiler designates which PE handles which interrupts
241241-- Hardware cost: edge detector + FIFO priority injection per interrupt
242242- source per PE
243243-- **Why superseded**: works, but ties interrupt handling to specific
244244- physical PE pins. The I/O controller model is more flexible and
245245- doesn't require any per-PE interrupt hardware.
246246-247247-### Alternative: Conventional Control Core for ISR
248248-- **Why rejected**: explicitly a non-goal to depend on a conventional
249249- control core for runtime operations.
250250-251251-### Alternative: Priority Routing for Interrupt Tokens
252252-- **Why rejected for v0**: over-engineered. the I/O controller approach
253253- provides adequate interrupt latency. priority can be added later
254254- (a priority bit + FIFO bypass is ~3 chips per routing node).
255255-256256----
257257-258258-## 7. Bootstrap / Program Loading
259259-260260-### Chosen: Layered Approach (microcontroller -> I/O controller)
262262-**UPDATED**: the previous design specified a dedicated hardwired microsequencer
263263-with a separate config bus. This has been superseded by a layered approach:
264264-265265-- **Phase 0-2**: external microcontroller as test fixture and bootstrap source
266266-- **Phase 4+**: I/O controller handles bootstrap via type-11 config writes
268268-No separate config bus. Bootstrap traffic travels the normal network as
269269-type-11 subtype-01 packets. This eliminates a dedicated bus and means the
270270-bootstrap path is also the runtime reprogramming path.
271271-272272-See `io-and-bootstrap.md` for the full bootstrap sequence.
273273-274274-### Previous Approach: Hardwired Microsequencer + Config Bus (superseded)
275275-- ROM state machine + UART + dedicated config bus
276276-- ~20-30 TTL chips
277277-- **Why superseded**: the type-11 config write mechanism eliminates the
278278- need for a separate config bus. The I/O controller (or external
279279- microcontroller during development) injects config writes onto the
280280- normal network. Simpler architecture, fewer buses, and the same
281281- mechanism enables runtime reprogramming.
282282-283283-### Alternative A: Bootstrap PE (hardwired to run loader from ROM)
284284-- **Status**: deferred but not rejected. The I/O controller bootstrap
285285- model is essentially a simplified version of this — a fixed-function
286286- device that reads from storage and emits config writes. Evolving the
287287- I/O controller toward a full PE with boot ROM is a natural future step.
288288- The architecture doesn't prevent it.
289289-290290-### Alternative B: External Host (6502, Z80, RP2040, etc.)
291291-- **Status**: the RP2040/Arduino IS the external host during Phase 0-2.
292292- It's a development tool, not part of the architecture. The long-term
293293- goal remains self-hosted bootstrap via the I/O controller.
294294-295295----
296296-297297-## 8. Data Width
298298-299299-No changes from previous version. 16-bit data, 32-bit bus, 36-bit escape
300300-hatch if needed.
301301-302302----
303303-304304-## 9. Clocking
305305-306306-### Chosen: Globally Synchronous with Gated Clocks (v0), Async Design Space Preserved
307307-308308-**NEW SECTION**: see `network-and-communication.md` for full details.
309309-310310-Three options were considered:
311311-312312-**Option A (chosen for v0): Globally synchronous, locally gated.**
313313-One master clock, stages stall independently via gated clocks. The simplest
314314-TTL implementation.
316316-**Option B: Mesochronous.** Same frequency, no phase alignment. Dual-clock
317317-FIFOs at boundaries. More complex, and not needed at v0 scale.
318318-319319-**Option C: Fully asynchronous.** No global clock, request/acknowledge
320320-handshaking everywhere. Theoretically ideal for dataflow (fast paths go
321321-fast, slow paths don't hold things up), but designing and debugging async
322322-TTL is painful.
323323-324324-The architecture preserves Option C by mandating ready/valid handshaking
325325-at every inter-module boundary and FIFOs at every domain crossing. under
326326-Option A, FIFOs are simple circular buffers sharing a clock. under Option
327327-C, FIFO internals change to async circuits but the interface signals are
328328-identical. no module redesign required.
329329-330330-The inter-PE network is the highest-value target for early async adoption,
331331-even while PEs themselves stay synchronous.

---

## 10. Miscellaneous Ideas Not Yet Integrated

### SM as Coprocessor for Complex Operations
- SM could handle matrix multiply, FFT butterfly, etc. by embedding
  specialised functional units alongside data.
- Very Amamiya-inspired (he embedded list operators in structure memory).
- Deferred: v0 SM has only read/write/fetch-and-add/CAS.

### 4x4 CAM Chips (100142) for Small Associative Lookups
- Too tiny for matching store (4 words x 4 bits per chip)
- Potentially useful for routing table entries at network nodes (4-8 entries)
- Keep in the parts bin. don't design around them.

### Compile-Time Token Route Scheduling
- Partially static routing: compiler pre-computes common routes, configures
  routing tables. network still handles dynamic routing for runtime-generated
  tokens.
- This is essentially what we've landed on. noted for clarity.

### Instruction Memory as Write-Back Cache
- Future idea: if instruction memory is writable at runtime, could it
  function as a write-back cache for a larger backing store? PE fetches
  function bodies on demand from SM or flash, caches them in local
  instruction SRAM. evicts on capacity pressure.
- Very speculative. would require significant additional hardware (tag
  memory, eviction logic, demand-fetch state machine). probably not
  worth it for v0-v4. but the writable instruction memory path means
  the hardware foundation exists.

---
`design-notes/versions/io-and-bootstrap.md`
# Dynamic Dataflow CPU — I/O & Bootstrap

Covers the type-11 subsystem: I/O controller design, peripheral interface,
bootstrap sequence, and the path from microcontroller-assisted bring-up to
self-hosted boot.

See `architecture-overview.md` for type-11 packet semantics.
See `network-and-communication.md` for how the I/O controller connects to
the bus.

## Type 11 Subtypes

Type 11 is the "system management" channel. the 2-bit subtype field
immediately after the type field discriminates traffic classes:

```
11 + 00: I/O operation — routed to the I/O controller
11 + 01: Extended address / config write — target PE instruction memory,
         routing table, or other config registers
11 + 10: Reserved (future: debug/trace, DMA, performance counters)
11 + 11: Reserved
```

All type-11 traffic is low frequency relative to types 00/01/10. it is
acceptable for decode and handling to take extra cycles.

## I/O Controller

### What It Is

A fixed-function device on the network. NOT a PE — no matching store, no
instruction memory, no ALU. it receives type-11 subtype-00 packets,
interprets them as I/O commands, and responds.

it is also the only network participant that can **spontaneously generate
tokens** without first receiving one. this is how external events (UART
RX, sensor interrupts, timer ticks) enter the dataflow graph.

### Token Format for I/O (subtype 00)

28 bits of payload after [type:2][subtype:2]. suggested allocation:

```
I/O Request (CM -> I/O controller):
[type:2=11][subtype:2=00][device:3][register:4][R/W:1][data:16][pad:4]

I/O Response (I/O controller -> CM):
Repackaged as a normal type 00 or 01 token addressed to the requesting CM.
Return routing must be provided somewhere — either in the request's
padding/data field (for reads), or preconfigured in the I/O controller.
```

**Open question**: return routing for I/O reads. options:
- (a) I/O read requests carry return routing in the data field (same
  pattern as SM READ requests — data field is unused on reads)
- (b) I/O controller has a preconfigured "return to" address per device
  (simpler requests, but less flexible)
- (c) I/O controller always returns to a fixed "I/O result handler" node
  in a designated PE (simplest, but rigid)

option (a) is most consistent with how SM works. likely the right call.
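The suggested layout can be sanity-checked with a small pack/unpack sketch. the field names and widths are the provisional ones above; the helper functions themselves are hypothetical, not part of the design:

```python
# Provisional 32-bit I/O request layout (MSB-first), per the note above:
# [type:2=11][subtype:2=00][device:3][register:4][R/W:1][data:16][pad:4]

IO_TYPE = 0b11      # type field: system traffic
IO_SUBTYPE = 0b00   # subtype 00: I/O operation

def pack_io_request(device: int, register: int, write: bool, data: int) -> int:
    """Pack an I/O request into a 32-bit word, fields MSB-first."""
    assert 0 <= device < 8 and 0 <= register < 16 and 0 <= data < 1 << 16
    word = IO_TYPE
    word = (word << 2) | IO_SUBTYPE
    word = (word << 3) | device
    word = (word << 4) | register
    word = (word << 1) | int(write)
    word = (word << 16) | data
    word = (word << 4) | 0          # pad:4, reserved
    return word

def unpack_io_request(word: int) -> dict:
    """Inverse of pack_io_request: recover the named fields."""
    return {
        "type":     (word >> 30) & 0b11,
        "subtype":  (word >> 28) & 0b11,
        "device":   (word >> 25) & 0b111,
        "register": (word >> 21) & 0b1111,
        "write":    bool((word >> 20) & 1),
        "data":     (word >> 4) & 0xFFFF,
    }
```

a round-trip through these two functions is a quick check that the field widths actually sum to 32 bits.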

### Hardware

```
Network Interface
  (receive type-11 subtype-00 packets)
        |
        v
  [Input FIFO]
        |
        v
  [Subtype Check] -- not subtype 00? --> discard or forward
        |
        v
  [Device/Register Decode] --- EEPROM or small logic
        |
        +---> [UART chip (6850/16550/etc.)]
        |       - TX data register
        |       - RX data register
        |       - Status register
        |       - Baud/config registers
        |
        +---> [future: SPI, GPIO, timer, etc.]
        |
        v
  [Result Formatter] --- constructs type 00/01 return token
        |
        v
  [Output FIFO]
        |
        v
  Network (token injected as type 00/01)
```

Estimated hardware: ~15-25 TTL chips + UART chip. comparable to the
microsequencer it replaces, but architecturally integrated.

### Unsolicited Token Generation (Interrupt Equivalent)

When an external event occurs (e.g., UART receives a byte), the I/O
controller generates a token and injects it onto the network. from the
receiving CM's perspective, data just arrived — exactly like any other
token. no interrupt hardware needed on the CM side.

The destination for unsolicited tokens is preconfigured: either hardcoded
in the I/O controller's EEPROM, or set at bootstrap via a type-11 config
write to the I/O controller itself. "when UART RX fires, send the byte
to PE 2, offset 0x10, context slot 3, port 0."

This is the **dataflow-native interrupt model**: external events are
token sources. they feed into the dataflow graph at designated entry
points. the receiving PE doesn't need to do anything special — it just
sees a token arrive and processes it like any other.

Implications:
- the I/O controller is a **source node** in the dataflow graph
- it breaks the invariant that "tokens are only produced in response to
  other tokens" — external reality leaks in here
- the network must accept tokens from the I/O controller even when no
  request was sent (the I/O controller's output FIFO can fill independently)
- if the destination PE's input FIFO is full, backpressure propagates
  to the I/O controller. UART RX bytes could be lost if the system can't
  keep up. the I/O controller should have a small internal buffer
  (or the UART chip's built-in FIFO handles this).

## Config Writes (subtype 01)

### Purpose

Type-11 subtype-01 packets write to PE instruction memory, routing tables,
or other configuration state. they are the mechanism for:

1. Bootstrap program loading
2. Runtime reprogramming (future)
3. Routing table configuration

### Packet Format

```
Config Write:
[type:2=11][subtype:2=01][target_PE:2][target_addr:10][data:16]

For wider addresses or multi-word writes, use two flits:
Flit 1: [type:2=11][subtype:2=01][target_PE:2][flags:2][addr_hi:8][addr_lo:8][pad:8]
Flit 2: [data:16][...additional fields as needed...]
```
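The single-flit form packs cleanly into 32 bits. a hypothetical packer, using the provisional field widths above (nothing here is a frozen format):

```python
# Provisional single-flit config write layout, per the note above:
# [type:2=11][subtype:2=01][target_PE:2][target_addr:10][data:16]

def pack_config_write(target_pe: int, target_addr: int, data: int) -> int:
    """Pack a config write into a 32-bit word; 2+2+2+10+16 = 32 bits."""
    assert 0 <= target_pe < 4, "target_PE:2 allows 4 PEs at v0"
    assert 0 <= target_addr < 1024 and 0 <= data < 1 << 16
    return (0b11 << 30) | (0b01 << 28) | (target_pe << 26) \
         | (target_addr << 16) | data
```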

**Open question**: exact bit allocation depends on how wide instruction
memory addresses need to be and how wide instruction words are. if
instruction words are wider than 16 bits, config writes are necessarily
multi-flit.

### Routing

Config writes are addressed to a specific PE by the target_PE field. the
routing network delivers them like any other token — type 11 is inspected
by routing nodes only to the extent of "this is not type 00/01/10, forward
toward the target." the target PE recognises the subtype-01 packet and
routes it to the instruction memory write port (see `pe-design.md`).

Routing tables themselves can be written via config writes. the target is
a routing node, not a PE. routing nodes need a small amount of config
write handling: recognise "this config write is for me" (based on node ID)
and update the local prefix table. during bootstrap, routing nodes are in
default mode (fixed-address routing), so config writes reach them reliably
without needing configured routing.

## Bootstrap Sequence

### Development / Early Prototyping

For Phase 0-2, an external microcontroller (RP2040, Arduino) acts as the
bootstrap source. it is NOT part of the architecture — it's a test fixture.

The microcontroller:
1. Formats type-11 subtype-01 packets (config writes)
2. Injects them into the network (via a dedicated injection port or by
   bit-banging the bus interface)
3. Writes instruction words to each PE's instruction memory
4. Optionally writes routing table entries to routing nodes
5. Optionally writes initial SM contents via type-10 packets
6. Injects seed token(s) — type 00/01 packets that kick off execution
7. Releases the bus (goes high-impedance or disconnects)

This lets PE and SM hardware be tested without any of the I/O controller
or bootstrap logic existing. the microcontroller is the bootstrap, the
debug interface, and the test harness all in one.

### Self-Hosted Bootstrap (Phase 4+)

The I/O controller replaces the microcontroller as the bootstrap source:

1. On reset, the I/O controller enters bootstrap mode
2. It reads program data from a connected flash/EEPROM (via SPI or
   parallel interface) or receives it over UART from an external host
3. It formats config write packets (type-11 subtype-01) and injects them
   onto the network
4. Each PE receives config writes and loads its instruction memory
5. Routing tables are configured via config writes to routing nodes
6. I/O controller injects seed token(s) to start execution
7. I/O controller transitions to normal mode (handling I/O requests)

The I/O controller's bootstrap logic is a state machine, likely driven
by a small ROM or EEPROM. it doesn't need to be a general-purpose
processor — it just sequences reads from storage and formats them as
config writes.

### Chicken-and-Egg: Routing During Bootstrap

During bootstrap, routing tables are not yet configured. the network uses
fixed-address default routing (see `network-and-communication.md`):

- Each PE has a unique ID (EEPROM / DIP switches)
- Routing nodes forward by PE_id without consulting tables
- At v0 scale (shared bus), this is trivially true — everything sees
  everything
- At larger scale, default routing must be sufficient to reach all PEs
  from the bootstrap source. this constrains the physical topology
  (bootstrap source must be topologically reachable from all PEs via
  default forwarding).

The I/O controller's own ID and the default routing path to it are
hardwired or EEPROM-configured. it doesn't depend on routing tables.

### Seed Token Injection

After program loading, the I/O controller (or microcontroller) injects
one or more seed tokens to start execution. these are normal type 00/01
tokens addressed to the entry point(s) of the loaded program.

For a simple program with one entry point: one seed token to one PE.
For a program with multiple independent entry points (e.g., main program
+ I/O handler): multiple seed tokens to different PEs.

The seed tokens are part of the program image — the compiler specifies
"to start this program, inject these tokens." the bootstrap loader reads
them from the program image and injects them after loading is complete.

## Layering Summary

The I/O and bootstrap design is explicitly layered for incremental
development:

| Phase | Bootstrap Source | I/O | Network Config |
|-------|------------------|-----|----------------|
| 0-1 | Microcontroller (external) | None | Direct SRAM programming |
| 2 | Microcontroller via type-11 | None | Config writes on bus |
| 3 | Microcontroller via type-11 | Polling via SM (optional) | Config writes |
| 4 | I/O controller (self-hosted) | I/O controller (type 11) | Config writes from I/O controller |

Each phase adds capability without redesigning previous work. the key
enabler is that config writes (type-11 subtype-01) work the same whether
they come from a microcontroller or the I/O controller. the network
doesn't know or care about the source.

## Open Design Questions

1. **I/O return routing** — option (a), (b), or (c) from above?
2. **Unsolicited token destination config** — hardcoded or runtime-
   configurable? if configurable, via what mechanism? (probably a
   type-11 config write to the I/O controller itself)
3. **I/O controller bootstrap ROM** — how big? what's in it? just a
   state machine for "read flash, emit config writes" or something more?
4. **Flash/EEPROM interface** — SPI? parallel? what storage device?
5. **Program image format** — what does the compiler output? a stream of
   (target_PE, address, instruction_word) tuples? plus seed tokens at
   the end?
6. **Multiple I/O devices** — how does the device field in the I/O token
   scale? 3 bits = 8 devices. enough?
7. **I/O controller as bootstrap PE** — at what point (if ever) does it
   make sense to make the I/O controller a full PE with a boot ROM
   instead of fixed-function? probably not for v0-v4, but worth keeping
   in mind architecturally.

---

# Dynamic Dataflow CPU — Network & Communication

Covers the interconnect between CMs, SMs, and the I/O subsystem. Routing
topology, clocking discipline, handshaking protocols, and the scaling path
from shared bus to split networks.

See `architecture-overview.md` for the token format and type field semantics
that drive routing decisions.

## Logical Networks

Three logical dataflow networks (CN, AN, DN) plus the system channel,
distinguished by the type field in the 32-bit token packet:

| Network | Direction | Token Types | Traffic |
|---------|-----------|-------------|---------|
| CN | CM <-> CM | 00 (dyadic), 01 (monadic) | operand tokens between PEs |
| AN | CM -> SM | 10 (structure) | memory operation requests |
| DN | SM -> CM | (results repackaged as 00/01) | memory operation results |
| System | any <-> I/O, any -> any (config) | 11 (system) | I/O, config writes, future debug |

DN traffic is interesting: SM produces results that are repackaged as
type 00 or 01 tokens destined for the requesting CM. so from a routing
perspective, DN results look like CN traffic once they leave the SM. the
SM result formatter handles this — it extracts return routing from the
original request and constructs a properly-typed token.

## Physical Implementation: v0

### Shared Bus with Pipelined Latches

For 4 PEs + 1-2 SMs + I/O controller (~6-7 nodes), all traffic shares a
single physical 32-bit bus. routing nodes inspect the type field and
forward to the appropriate destination:

- Types 00/01: route by PE_id field to destination CM
- Type 10: route by SM_id field to destination SM bank
- Type 11: route to I/O controller (subtype 00) or to target PE's config
  input (subtype 01)
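The per-packet decision a v0 routing node makes reduces to a few comparisons. a rough software model, assuming the fields a node would extract from the 32-bit token (names like `pe_id`/`sm_id` are illustrative, not a fixed header layout):

```python
# Toy model of the v0 routing decision described above. The packet is a
# dict of already-decoded fields; in hardware this is a comparator + mux.

def route(packet: dict) -> str:
    t = packet["type"]
    if t in (0b00, 0b01):                    # operand tokens: route by PE id
        return f"CM{packet['pe_id']}"
    if t == 0b10:                            # structure memory request
        return f"SM{packet['sm_id']}"
    # type 11: subtype 00 -> I/O controller, subtype 01 -> target PE's
    # config input; other subtypes are reserved
    if packet["subtype"] == 0b00:
        return "IO"
    if packet["subtype"] == 0b01:
        return f"CM{packet['pe_id']}.config"
    return "reserved"
```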

Multiple packets can be in flight simultaneously. each hop through a
routing node takes one cycle, and latches at each stage hold the packet.
with 2-3 hop maximum paths, 2-3 packets can be in transit concurrently.

Bus arbitration: if multiple sources want to inject a packet in the same
cycle, priority logic or round-robin selects one and the others assert
backpressure (their output latch stays full, which stalls their pipeline
via the ready/valid protocol).

Hardware estimate: ~2-3K transistors for the routing network at this
scale. each routing node is essentially a comparator on the type/destination
fields + a mux + a latch.

### Scaling Path: Split Networks

When (if) contention becomes measurable, the first split is to separate
the AN/DN from the CN:

- CN carries types 00, 01, and 11 (CM-to-CM + system traffic)
- AN/DN carry type 10 (SM traffic) on a dedicated path

This is a topology change, not a protocol change. no module interfaces
change. routing nodes on each network simply stop seeing the traffic types
that moved to the other network.

Further splits (dedicated type-11 system bus, per-SM AN/DN paths) follow
the same pattern. the type field makes this incrementally decomposable.

### Fixed-Address Bootstrap Routing

During bootstrap, before routing tables are configured, all PEs are
reachable via fixed physical addresses. each PE has a unique ID
(set via EEPROM or DIP switches — see `pe-design.md`). routing nodes
use a hardwired default mode where they route purely by PE ID without
consulting lookup tables.

Two approaches, not mutually exclusive:

**Default routing mode**: each routing node has a "configured" bit.
when unset (power-on default), the node routes by simple PE_id matching —
if the destination is on this node's port, deliver; otherwise, forward.
once routing tables are loaded via type-11 config writes, the
"configured" bit is set and the node switches to table-based routing.
hardware cost: one bit + one mux per routing node.

**Flat addressing**: at 4 PEs on a shared bus, every node can see every
packet anyway. destination PE_id in the packet header is sufficient.
hierarchical prefix routing is irrelevant until the network topology has
multiple levels. for v0, "flat addressing" is just how shared buses work.

## Routing Topology (Multi-PE)

### Hierarchical Prefix-Based Routing

NOT Manchester-style omega network. prefix routing gives variable latency:
local = 1 hop, cross-cluster = 2-3 hops. average latency depends on
program locality, which the compiler can optimise.

- Top bits of PE_id select cluster, lower bits select within cluster
- Each routing node has a small prefix lookup table, configured at
  program load time (via type-11 config writes)
- Pao's bitwise AND trick potentially useful for routing decisions or
  small associative lookups at routing nodes
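A toy model of the prefix lookup at one cluster-level node, assuming a 4-bit PE_id whose top two bits name the cluster. the widths, port names, and table contents are illustrative only — in the real design the table would be loaded via type-11 config writes:

```python
# Sketch of prefix routing at a single cluster node. Local PEs are
# delivered directly by the low bits; cross-cluster traffic is forwarded
# out a port chosen by the cluster prefix.

LOCAL_CLUSTER = 0b01          # this node's own cluster prefix (example)
PREFIX_TABLE = {0b00: "port_west", 0b10: "port_east", 0b11: "port_east"}

def next_hop(pe_id: int) -> str:
    cluster = (pe_id >> 2) & 0b11          # top bits select cluster
    if cluster == LOCAL_CLUSTER:
        return f"local_pe_{pe_id & 0b11}"  # low bits select within cluster
    return PREFIX_TABLE[cluster]           # cross-cluster: forward by prefix
```

the variable latency falls out directly: a local hit is one table-free hop, a cross-cluster packet takes however many forwards the prefix tables chain together.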

This topology doesn't need to be built until Phase 3+. the token format
supports it already. see `design-alternatives.md` for comparison with
omega, crossbar, ring, and shared bus approaches.

## Clocking Discipline

### Design Principle

**Every inter-module boundary communicates via ready/valid handshaking.
no module assumes anything about the timing of the module on the other
side of a FIFO.**

This is the single most important architectural constraint for preserving
future design space. it enables starting with globally synchronous clocking
and evolving toward partially or fully asynchronous operation without
changing any module interfaces.

### v0: Globally Synchronous, Locally Gated (Option A)

One master clock. each pipeline stage only advances when its input FIFO
has data AND its output FIFO has space. the clock to each stage is gated:

```
stage_clock = master_clock AND input_not_empty AND output_not_full
```

Each stage is flip-flop based, everything is referenced to the same edge,
but stages can stall independently. this is the simplest TTL implementation
and sufficient for initial bring-up and testing.

### Future: Fully Asynchronous (Option C)

No global clock. each stage signals "data ready" to the next, which signals
"accepted" back (4-phase or 2-phase handshake). fast paths go fast, slow
paths (SM access, cross-PE routing) take longer without holding anything
else up.

Designing fully async in TTL is painful (hazard-prone, harder to debug),
but the architecture does NOT rule it out, provided the ready/valid
discipline is maintained from the start.

### Where Async Pays Off Most: The Inter-PE Network

Even under Option A (synchronous PEs), the routing network between PEs
benefits from asynchronous handshaking. routing latency is variable
(depends on path length and contention), which is awkward to handle in a
synchronous pipeline. with async handshaking on routing nodes, tokens
propagate at wire speed + gate delay and land in the destination PE's
input FIFO, which synchronises to the local clock.

This is a small, contained piece of async design that buys a lot. it means
the inter-PE network doesn't constrain the PE clock frequency, and adding
routing hops doesn't require slowing the whole system down.

### Concrete Requirements to Preserve Option C

1. **FIFO interfaces are defined as**:
   - Input side: `data_in`, `write_enable`, `full`
   - Output side: `data_out`, `read_enable`, `empty`
   - No clock crossing assumptions in the protocol

2. **Under Option A** (synchronous): both sides share a clock.
   `write_enable` / `read_enable` are gated clock enables. FIFO is a
   simple circular buffer.

3. **Under Option C** (async): FIFO internals become async (gray-code
   pointers or 4-phase handshake). interface signals are identical.
   nothing on either side of the FIFO changes.

4. **Never design a path where module A asserts a signal and module B is
   assumed to see it on the next clock edge without a FIFO or latch in
   between.** If this discipline is maintained, swapping to async later
   is a FIFO-internal change, not an architectural one.

5. **Arbitration interfaces use request/grant, not clock-phase
   assumptions.** This matters for instruction memory arbitration (pipeline
   vs network write — see `pe-design.md`) and shared bus access. a
   synchronous arbiter uses request/grant resolved on a clock edge. an
   async arbiter (Seitz mutual exclusion element) uses request/grant
   resolved by circuit dynamics. the interface is the same.
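The interface contract in requirement 1 can be modelled in a few lines — a plain circular buffer (the Option A internals) whose only externally visible state is `full`/`empty` plus the two enables. depth and data type are arbitrary here; this is a behavioural sketch, not the hardware:

```python
# Minimal synchronous model of the FIFO interface contract above.
# Swapping the internals for an async implementation would change nothing
# a producer or consumer written against this interface can observe.

class Fifo:
    def __init__(self, depth: int):
        self.buf = [None] * depth
        self.depth = depth
        self.rd = self.wr = self.count = 0

    @property
    def full(self) -> bool:
        return self.count == self.depth

    @property
    def empty(self) -> bool:
        return self.count == 0

    def write(self, data) -> bool:
        """write_enable: accepted only when not full (False = backpressure)."""
        if self.full:
            return False
        self.buf[self.wr] = data
        self.wr = (self.wr + 1) % self.depth
        self.count += 1
        return True

    def read(self):
        """read_enable: returns None when empty, else the oldest word."""
        if self.empty:
            return None
        data = self.buf[self.rd]
        self.rd = (self.rd + 1) % self.depth
        self.count -= 1
        return data
```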

### SM Clock Independence

SM bank access time may differ from the PE pipeline clock. the split-phase
nature of SM access (request on AN, result on DN whenever ready) already
accommodates this. SM can run on its own clock, or at its own speed in an
async design. FIFOs at the AN input and DN output handle the domain
crossing.

## Backpressure

All flow control is via FIFO fullness. when a FIFO is full, the upstream
module stalls:

- PE output FIFO full -> PE pipeline stalls at token output stage
- Routing node output latch full -> upstream routing node holds packet
- SM request FIFO full -> AN stops accepting from CMs
- PE input FIFO full -> network stops delivering to that PE

This propagates backpressure naturally without deadlock, **provided there
are no circular dependencies in the flow graph that require simultaneous
forward progress on multiple paths.** in practice, dataflow graphs are
DAGs (or have cycles broken by the matching store / context slot mechanism),
so this is generally safe. worth verifying per-program, though.

The one risk is the shared bus at v0: if all PE input FIFOs are full and
all PEs are trying to send, you get global stall. this is a capacity
problem (FIFOs too small or too much parallelism for the bus bandwidth),
not a protocol problem. increasing FIFO depth or splitting the bus
resolves it.

## Open Design Questions

1. Exact routing node logic — comparator + mux + latch, or something
   more sophisticated?
2. Bus arbitration policy — round-robin vs priority? priority for type-11
   config traffic during bootstrap?
3. FIFO depth at each boundary — 8-deep at PE input is the current sketch,
   what about routing node latches and SM FIFOs?
4. Async routing node prototype — worth building one async routing node
   early to validate the handshake protocol?

---
`design-notes/versions/pe-design(1).md`
# Dynamic Dataflow CPU — PE (Processing Element) Design

Covers the CM (Control Module) pipeline, matching store, instruction memory,
context slot management, and per-PE identity.

See `architecture-overview.md` for token format and module taxonomy.
See `network-and-communication.md` for how tokens enter/leave the PE.

## Design Philosophy: Static Assignment, Compiler-Driven Sizing

This design diverges significantly from both Manchester and Amamiya in how
PEs are used. Understanding the difference is critical to understanding why
the matching store can be so much smaller here.

**Amamiya DFM (1982/17407 papers):** every PE has ALL function bodies
pre-loaded in instruction memory (8KW, 58 bits/word per PE, identical
contents across all PEs). Function *instances* are dynamically assigned to
PEs at runtime by a CCU (Cluster Control Unit) that picks the least-loaded
PE. The OM (operand matching memory) needs 1024 CAM blocks per PE because
any function can run anywhere, and deep Lisp recursion means many
simultaneous activations. The "semi-CAM" was their solution to making this
affordable — instance name directly addresses a block, then 4-way
set-associative lookup within the block on instruction identifier.

**Manchester (Gurd 1985):** similar story but with hashing instead of
semi-CAM. 16 parallel 64K-token memory banks per PE for set-associative
hash lookup. 1M token capacity matching store. Plus an overflow unit
(initially emulated on the host). The matching unit alone was 16 memory
boards per PE.

Both machines sized their matching stores for worst-case dynamic scheduling
of arbitrary programs. The whole program lives in every PE (or in a single
PE's matching unit), and any activation can land anywhere. That's why
those matching stores are enormous.

**This design:** the compiler statically assigns function bodies (or chunks
of them) to specific PEs. Different PEs have different instruction memory
contents. The compiler knows at compile time which functions run where,
and can calculate maximum concurrent activations per PE. This means:

- Instruction memory is NOT replicated — each PE only holds its assigned
  function bodies. IM can be much smaller.
- The matching store only needs enough context slots for the maximum
  concurrent activations the compiler predicts for that specific PE.
  Not 1024. Probably 16-32.
- No CCU needed for dynamic PE allocation. Scheduling decisions are
  made at compile time.
- The tradeoff is scheduling flexibility — you can't dynamically
  rebalance load at runtime. The compiler must get it roughly right.

### Function Splitting Across PEs

A "function" in the source language does NOT need to map 1:1 to a
contiguous block on one PE. The compiler can split a function body at
any data-dependency boundary. The token network doesn't know or care
whether two instructions are "in the same function" — it just sees tokens
with destinations.

A 40-instruction function body could be split into three chunks of ~13
instructions across three PEs, each chunk fitting in a smaller context
slot. The "function" as the architecture sees it is really "a set of
instructions that share a context slot ID on this PE." The compiler
defines what that grouping means.

This is a powerful lever for keeping context slots small: if a function
body is too big for the slot size, the compiler splits it. The split
introduces inter-PE token traffic (extra network hops), but keeps
per-PE hardware simple. The compiler can optimise the split points to
minimise cross-PE traffic.

**Implication for context slot semantics:** a context slot doesn't mean
"one function activation." It means "one chunk of work sharing a local
operand namespace on this PE." Multiple context slots on different PEs
might collectively represent one function activation. The token's ctx_slot
field scopes operand matching to a local context, nothing more.

**Implication for the compiler:** this architecture actively wants either
small functions or functions distributed across PEs. The compiler is free
to treat any subgraph of the dataflow graph as a "chunk" and assign it to
a PE, regardless of source-level function boundaries. Loop bodies, branch
arms, pipeline stages — all valid chunk boundaries. The grain of
scheduling is the subgraph, not the function.

## PE Identity

Each PE has a unique ID used for routing. Two mechanisms, not mutually
exclusive:

**EEPROM-based**: the instruction decoder EEPROM already contains
per-PE truth tables. The PE ID can be encoded as additional input bits
to the EEPROM, meaning the EEPROM contents are unique per PE but the
circuit board is identical. The instruction decoder "knows" which PE
it is because its EEPROM was burned with that ID.

**DIP switches**: 3-4 switches give 8-16 PE addresses. Better for early
prototyping — reconfigurable without reflashing. Can coexist with the
EEPROM approach (switches provide ID bits that feed into the EEPROM
address lines).

The PE ID is needed in two places:
1. Input token filtering: "is this token addressed to me?"
2. Output token formatting: "set the source PE field" (if result tokens
   carry source info for return routing)
104104-105105-## PE Pipeline (5-stage sketch)
106106-107107-```
108108-Stage 1: TOKEN INPUT
109109- - Receive token from network
110110- - Classify: type 00/01 (normal), type 11 subtype 01 (config write)
111111- - Normal tokens -> pipeline FIFO
112112- - Config writes -> instruction memory write port (stalls pipeline)
113113- - Buffer in small FIFO (8-deep, 32-bit)
114114- - ~1K transistors (flip-flops) or use small SRAM
115115-116116-Stage 2: MATCH / BYPASS
117117- - Type 00 (dyadic): direct-index into context slot array
118118- - Check generation counter: mismatch = stale, discard
119119- - First operand: store in slot, advance to wait state
120120- - Second operand: read partner from slot, both proceed
121121- - Type 01 (monadic): bypass matching entirely, proceed directly
122122- - Single cycle for all cases (no hash path, no CAM search —
123123- direct indexing only, see matching store section below)
124124- - Estimated: ~200-300 transistors + SRAM
125125-126126-Stage 3: INSTRUCTION FETCH
127127- - Use local offset to read from PE's instruction SRAM
128128- - External SRAM chip, so just address generation logic
129129- - ~200 transistors of logic
130130- - NOTE: instruction memory is shared between pipeline reads and
131131- network config writes — see "Instruction Memory" section below
132132-133133-Stage 4: EXECUTE
134134- - 8/16-bit ALU
135135- - ~500-2000 transistors depending on width and features
136136-137137-Stage 5: TOKEN OUTPUT
138138- - Form result token with routing prefix (type, destination PE/SM,
139139- offset, context, etc.)
140140- - Inject into network via output FIFO
141141- - ~300 transistors
142142-```
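Stage 1's classify/route decision reduces to a few bit tests. A minimal Python sketch, assuming the type field sits in the top two bits and the subtype in the next two (field positions are illustrative, not the final token format):

```python
# Stage 1 token classification sketch. Field positions are ASSUMED:
# top two bits = type, next two = subtype.

def classify(token: int, width: int = 32) -> str:
    """Route an incoming token: 'pipeline' for normal tokens,
    'config' for type-11 subtype-01 instruction-memory writes."""
    ttype = (token >> (width - 2)) & 0b11
    subtype = (token >> (width - 4)) & 0b11
    if ttype in (0b00, 0b01):          # dyadic / monadic -> pipeline FIFO
        return "pipeline"
    if ttype == 0b11 and subtype == 0b01:
        return "config"                # IM write port, stalls pipeline
    return "drop"                      # unrecognised: discard (v0 policy)
```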
143143-144144-Pipeline registers between stages: ~500 transistors
145145-Control logic (state machine, handshaking): ~500-1000 transistors
146146-147147-**Per-PE total: ~3-5K transistors of logic + SRAM chips**
148148-149149-(Revised down from earlier 5-8K estimate. The matching stage is dramatically
150150-simpler than originally sketched now that hash fallback is removed from the
151151-primary pipeline. See matching store section.)
152152-153153-## Instruction Memory
154154-155155-### Static Assignment, Per-PE Contents
156156-157157-Unlike Amamiya where every PE has identical IM contents (full program),
158158-each PE here holds only the function bodies (or function chunks) assigned
159159-to it by the compiler. This means:
160160-161161-- IM is smaller per PE (only assigned code, not the whole program)
162162-- Different PEs have different IM contents (loaded at bootstrap)
163163-- The compiler emits a per-PE instruction image as part of the program
164164-165165-### Runtime Writability
166166-167167-Instruction memory is **not** read-only. It is writable from the network
168168-via type-11 subtype-01 (config/extended address) packets. This serves
169169-two purposes:
170170-171171-1. **Bootstrap**: loading programs before execution starts
172172-2. **Runtime reprogramming**: loading new function bodies while other PEs
173173- continue executing (future capability, not needed for v0)
174174-175175-Runtime writability also means instruction memory size is not a hard
176176-architectural limit — if a program needs more code than fits in one PE's
177177-IM, the runtime (or a management PE) could swap function bodies in and
178178-out. Very speculative, but the hardware path exists.
179179-180180-### Implementation
181181-182182-Instruction memory is external SRAM. The PE pipeline reads from it during
183183-Stage 3 (instruction fetch). The network can write to it via config
184184-packets received at Stage 1.
185185-186186-Shared SRAM means arbitration between two users:
187187-- Pipeline reads (instruction fetch): high frequency, performance-critical
188188-- Network writes (config): low frequency, can tolerate delay
189189-190190-**Arbitration approach**: network writes get priority when they arrive
191191-(they're rare and bursty during bootstrap). When a config write is in
192192-progress, the pipeline stalls for one cycle at Stage 3. Hardware cost:
193193-mux on SRAM address/data buses + write-enable gating + stall signal to
194194-pipeline. Roughly 5-8 TTL chips.
195195-196196-**Async-compatible arbitration**: defined as request/grant interface.
197197-Synchronous implementation: priority mux resolved on clock edge. Async
198198-implementation: mutual exclusion element (Seitz arbiter). Interface is
199199-the same in both cases. See `network-and-communication.md` for clocking
200200-discipline.
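A behavioural sketch of the request/grant interface described above — the synchronous priority-mux version, with config writes winning and the pipeline stalling for that cycle (names are illustrative):

```python
# One-cycle arbiter sketch: network config writes get priority,
# pipeline instruction fetches stall when a write is in progress.

def arbitrate(pipeline_req: bool, config_req: bool):
    """Return (grant_pipeline, grant_config, stall_pipeline)."""
    if config_req:                     # rare, bursty during bootstrap
        return (False, True, pipeline_req)
    return (pipeline_req, False, False)
```

The async implementation would replace this function with a Seitz mutual-exclusion element, but the three-signal interface stays the same.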
201201-202202-### EEPROM-Based Instruction Decoding
203203-204204-The instruction decoder can be implemented as an EEPROM acting like a PLD.
205205-Input bits = instruction opcode fields + PE ID bits. Output bits = control
206206-signals for the ALU, matching store, token output formatter, etc.
207207-208208-This gives significant flexibility:
209209-- Instruction set can be changed by reflashing the EEPROM (no board changes)
210210-- Per-PE customisation (different PEs could theoretically have different
211211- instruction subsets, though unlikely for v0)
212212-- The PE ID is "free" — it's just more EEPROM address bits
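The EEPROM-as-decoder idea reduces to a lookup table whose address concatenates PE ID and opcode bits. A sketch with assumed field widths and a hypothetical control word (both placeholders, not a specified encoding):

```python
# EEPROM decoder sketch. PE_ID_BITS, OPCODE_BITS, and the control-word
# value below are illustrative ASSUMPTIONS, not the final format.

PE_ID_BITS, OPCODE_BITS = 4, 6

def eeprom_address(pe_id: int, opcode: int) -> int:
    """The PE ID is 'free': it is simply more EEPROM address bits."""
    assert pe_id < (1 << PE_ID_BITS) and opcode < (1 << OPCODE_BITS)
    return (pe_id << OPCODE_BITS) | opcode

# Burning a per-PE image: identical board, unique ROM contents.
rom = {eeprom_address(pe_id=3, opcode=0x21): 0b1010_0001}  # hypothetical control word
```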
213213-214214-## Matching Store Design
215215-216216-### Why It Can Be Small
217217-218218-The matching store is the highest-risk component in any dataflow machine.
219219-Manchester needed 16 memory boards per PE. Amamiya needed 1024 CAM blocks
220220-(32KW at 43 bits/word) per PE. Both were sized for worst-case dynamic
221221-scheduling of arbitrary programs.
222222-223223-This design avoids that because:
224224-225225-1. **Static PE assignment**: the compiler knows which functions run on
226226- which PE and can calculate maximum concurrent activations per PE.
227227-2. **Function splitting**: the compiler can split large function bodies
228228- across PEs so no single PE needs a huge context slot.
229229-3. **Compiler-controlled slot allocation**: the compiler assigns context
230230- slot IDs at compile time for statically-known activations. Only
231231- genuinely dynamic activations (runtime-determined recursion depth)
232232- need runtime allocation.
233233-234234-The matching store size is therefore a *compiler parameter*, not an
235235-architectural constant. The hardware provides N context slots of M entries
236236-each. The compiler must generate code that fits within those limits,
237237-splitting and scheduling accordingly.
238238-239239-### Architecture: Pure Direct-Indexed Context Slots
240240-241241-No hash fallback. No CAM search. No set-associative lookup.
242242-243243-The matching operation is:
244244-245245-```
246246-SRAM_address = [ctx_slot (from token) : match_entry (from token or instr)]
247247-read SRAM at that address
248248-check generation counter: mismatch = stale, discard token
249249-250250-if occupied bit set:
251251- -> match found, read stored operand, proceed to instruction fetch
252252- -> clear occupied bit
253253-else:
254254- -> no match, write incoming operand, set occupied bit
255255- -> token consumed, advance to next input token
256256-```
257257-258258-Single cycle. Always. The only comparison is the generation counter check
259259-(2-bit XOR, trivial). There is no "miss" path that requires multi-cycle
260260-recovery, because the address is deterministic — the compiler guaranteed
261261-that ctx_slot + match_entry uniquely identifies this matching location.
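The matching operation above can be modelled directly. A behavioural sketch (not the emulator's actual code) using Config B sizes from the next section:

```python
# Direct-indexed match sketch: single-cycle in hardware, no miss path.
# Config B sizes assumed: 16 slots x 32 entries.

SLOTS, ENTRIES = 16, 32
values   = [0] * (SLOTS * ENTRIES)       # main SRAM: 16-bit operands
occupied = [False] * (SLOTS * ENTRIES)   # 1-bit-per-cell bitmap
gen      = [0] * SLOTS                   # 2-bit generation counter per slot

def match(ctx_slot: int, entry: int, token_gen: int, operand: int):
    """Returns ('fire', partner) on a match, ('wait', None) after storing
    the first operand, ('stale', None) when the generation check fails."""
    addr = ctx_slot * ENTRIES + entry    # [ctx_slot : match_entry]
    if token_gen != gen[ctx_slot]:       # 2-bit compare: stale, discard
        return ("stale", None)
    if occupied[addr]:                   # partner waiting: match found
        occupied[addr] = False
        return ("fire", values[addr])
    values[addr], occupied[addr] = operand, True
    return ("wait", None)                # token consumed
```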
262262-263263-Hardware cost:
264264-- One SRAM chip (matching store data)
265265-- Small register file or SRAM (occupied bitmap + port flags)
266266-- Address generation: concatenate ctx_slot bits and match_entry bits (wires)
267267-- Read/write control: occupied bit check + generation compare (~3-4 chips)
268268-- Generation counter storage: 2 bits per context slot (~1 chip for 32 slots)
269269-270270-Total matching logic per PE: **~200-300 transistors + one SRAM chip + bitmap.**
271271-Order of magnitude less than the 2-3K transistors estimated when the
272272-design included a hash fallback path.
273273-274274-### Context Slot Sizing
275275-276276-The slot count (N) vs entry count (M) tradeoff maps to:
277277-- **N (slots)**: how many concurrent activations can this PE handle
278278-- **M (entries per slot)**: how many dyadic instructions per function chunk
279279-280280-Both are compiler-controllable. More slots = more parallelism headroom.
281281-More entries per slot = bigger function bodies without splitting. The
282282-compiler balances these.
283283-284284-**Candidate configurations (targeting clean SRAM utilisation):**
285285-286286-```
287287-Config A: 32 slots x 16 entries = 512 cells
288288- - 9-bit SRAM address (5 ctx + 4 offset)
289289- - 16-bit values: 1KB exactly in one 8Kbit SRAM chip
290290- - Good concurrency headroom, smaller function chunks
291291-292292-Config B: 16 slots x 32 entries = 512 cells
293293- - 9-bit SRAM address (4 ctx + 5 offset)
294294- - 16-bit values: 1KB exactly
295295- - Matches current 4-bit ctx_slot token format
296296- - Fewer concurrent activations, bigger function chunks
297297-298298-Config C: 32 slots x 32 entries = 1024 cells
299299- - 10-bit SRAM address (5 ctx + 5 offset)
300300- - 16-bit values: 2KB, fits in one 16Kbit SRAM chip
301301- - Most headroom in both dimensions
302302- - Probably the sweet spot for SRAM utilisation vs headroom
303303-304304-Config D: 64 slots x 16 entries = 1024 cells
305305- - 10-bit SRAM address (6 ctx + 4 offset)
306306- - 16-bit values: 2KB
307307- - Favours concurrency over function chunk size
308308- - 64 concurrent activations likely overkill for v0 but future-proof
309309-```
310310-311311-**SRAM layout:** store 16-bit operand values in the main SRAM chip
312312-(standard 8-bit-wide chips, 2 bytes per entry, sequential access or
313313-use 16-bit-wide SRAM if available). Occupied flags + port indicators
314314-stored separately:
315315-316316-- **Occupied bitmap**: 1 bit per cell. 512 cells = 64 bytes (trivially
317317- small — a few flip-flops or one tiny SRAM). 1024 cells = 128 bytes.
318318-- **Port indicator**: 1 bit per cell (left/right operand). Same size as
319319- occupied bitmap. Can share the same storage.
320320-- **Generation counters**: 2 bits per *slot* (not per cell). 32 slots =
321321- 8 bytes. Trivial — a small register file or a handful of flip-flops.
322322-323323-This separation means the main SRAM stores only 16-bit values at clean
324324-power-of-two addresses. No awkward 18-bit word widths. The metadata
325325-(occupied, port, gen) is tiny and stored in dedicated fast-access
326326-registers alongside the SRAM.
327327-328328-**Recommendation for v0**: start with Config B (16 slots x 32 entries =
329329-1KB) to match the current 4-bit ctx_slot token field. Upgrade to Config C
330330-(32 x 32 = 2KB, needs 5-bit ctx_slot) if 16 concurrent activations
331331-proves too tight. The physical SRAM chip doesn't change between these
332332-configs — just the address generation logic.
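The config tradeoffs reduce to arithmetic. A small helper (illustrative, not part of the emulator) derives the numbers quoted above:

```python
# Derive address width and main-SRAM size for a (slots x entries) config.
# Assumes cell count is a power of two and 16-bit operand values.

def config(slots: int, entries: int, value_bits: int = 16):
    cells = slots * entries
    addr_bits = cells.bit_length() - 1          # power-of-two cell count
    return {"cells": cells, "addr_bits": addr_bits,
            "bytes": cells * value_bits // 8}
```

Config B: `config(16, 32)` gives 512 cells, 9 address bits, 1KB; Config C: `config(32, 32)` gives 1024 cells, 10 address bits, 2KB — matching the table.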
333333-334334-### Instruction Address vs Matching Store Address
335335-336336-These are NOT the same thing, and this distinction matters:
337337-338338-- **Instruction address** (used in Stage 3): indexes into instruction
339339- memory SRAM. 7-8 bits (128-256 instructions per PE). Used by ALL
340340- token types. This is the "offset" field in the token.
341341-- **Matching store address** (used in Stage 2): indexes into matching
342342- store SRAM. Composed of [ctx_slot : match_entry]. Only used by
343343- dyadic tokens.
344344-345345-The compiler maintains the mapping. For dyadic instructions, the
346346-instruction word in IM includes a "match_entry" field that tells the
347347-hardware which matching store entry corresponds to this instruction.
348348-349349-This means the matching store is dense with respect to dyadic instructions
350350-— no gaps for monadic instructions. A function chunk with 20 instructions,
351351-8 of which are dyadic, uses 8 matching store entries, not 20.
352352-353353-**Simplest v0 approach:** the token carries the instruction memory offset
354354-(for Stage 3). It is tempting to let the instruction word fetched in
355355-Stage 3 supply the match_entry index, but Stage 2 happens BEFORE
356356-Stage 3 — the match must complete before the fetch — so the match_entry
357357-must come from the token (or be derivable from it), not from the
358358-instruction word. This means either:
362362-363363-(a) The token carries both an instruction offset AND a match entry index.
364364- Costs token bits. May require the offset field to be split or the
365365- match_entry to be packed into unused bits.
366366-367367-(b) The match_entry IS the instruction offset, and the instruction memory
368368- is laid out so that the offset of a dyadic instruction is also its
369369- matching store entry within the slot. This works if the compiler
370370- assigns offsets such that dyadic instruction offsets are dense (0, 1,
371371- 2, ...) and monadic instruction offsets are in a separate range.
372372-373373-(c) A small lookup ROM/SRAM alongside the matching store maps instruction
374374- offset -> match_entry. This adds a read before the match SRAM access
375375- (serial, adds latency) or requires a second SRAM port (parallel, adds
376376- hardware).
377377-378378-Option (b) is the simplest if the compiler can make it work. The
379379-instruction memory layout would be: dyadic instructions at offsets 0..M-1,
380380-monadic instructions at offsets M..N-1. The token's offset field directly
381381-indexes both the matching store (for dyadic) and the instruction memory
382382-(for everything). The matching store just doesn't get accessed for offsets
383383->= M (monadic range).
384384-385385-This constrains instruction memory layout — dyadic instructions must be
386386-packed at the low end. But the compiler controls the layout, so this is
387387-achievable.
388388-389389-**Recommendation for v0:** option (b). Dyadic instructions packed at
390390-offsets 0..M-1 in instruction memory, monadic at M..N-1. Token offset
391391-directly serves as both instruction address and (for offsets < M)
392392-matching store entry within the context slot. Clean, no extra bits,
393393-no extra lookup, single cycle. Constraint on the compiler, not on the
394394-hardware.
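Option (b)'s dual-use offset comes down to a single comparison. A sketch under the Config B assumption M = 32:

```python
# Option (b) sketch: dyadic instructions packed at offsets 0..M-1, so the
# token's offset doubles as the matching store entry. M = 32 assumed.

M = 32  # dyadic entries per context slot (Config B)

def needs_match(offset: int) -> bool:
    """Offsets below M are dyadic (Stage 2 matching); offsets >= M are
    monadic and bypass the matching store entirely."""
    return offset < M

def match_entry(offset: int) -> int:
    assert needs_match(offset), "monadic offsets never index the matching store"
    return offset       # identity mapping: no extra token bits, no lookup ROM
```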
395395-396396-### What About Overflow?
397397-398398-If the matching store is full (all slots occupied) or a function body
399399-exceeds M dyadic instructions:
400400-401401-**Compile-time prevention (primary strategy):**
402402-- The compiler knows the slot count and entry count
403403-- It splits functions and schedules activations to fit
404404-- If a program genuinely can't fit (unbounded recursion deeper than N
405405- slots), the compiler inserts throttling code: a token that waits for
406406- a slot to free before allowing the next recursive call
407407-- This is the Amamiya throttle idea, but implemented in software
408408- (compiler-inserted dataflow logic) rather than hardware
409409-410410-**Runtime overflow (safety net):**
411411-- If a token arrives for a full matching store (shouldn't happen with
412412- correct compilation), the PE stalls the input FIFO until a slot frees.
413413- Simplest, safest, most debuggable. If it fires, something is wrong
414414- and stalling surfaces the bug.
415415-416416-**Future: small CAM overflow buffer**
417417-- If runtime overflow becomes a real issue (genuinely unpredictable
418418- recursion depth), a small CAM (4-8 entries using 100142 chips or
419419- similar) per PE could catch overflow tokens
420420-- Sits between input FIFO and SRAM matching store, catches tokens
421421- that don't fit, retries when slots free up
422422-- Not needed for v0. The input FIFO interface doesn't change.
423423-- 100142 chips (4 words x 4 bits) could give a 4-entry overflow buffer
424424- at maybe 6-8 chips per PE. Small but might handle 95% of overflow
425425- cases where a slot frees within a few cycles.
426426-427427-## Context Slot Lifecycle
428428-429429-### Allocation: Bump Allocator
430430-- Counter + register per PE
431431-- On function activation: current counter value = new context slot ID
432432-- Counter increments (wraps around to 0 after max slot)
433433-- On wrap: checks occupied bitmap to find next free slot
434434-- Hardware: binary counter + bitmap register + priority encoder for
435435- free-slot finding. ~8-10 TTL chips.
436436-- Alternative: small FIFO of free slot IDs, populated at init and on
437437- deallocation. Avoids bitmap scan. ~5-8 chips.
438438-439439-### Deallocation
440440-- Compiler inserts explicit "free" instruction on every exit path
441441-- Free instruction clears the slot's occupied bits (all entries in
442442- the slot) and returns the slot ID to the free pool
443443-- Multiple frees are idempotent / harmless
444444-- Freed slots are immediately available for reallocation
445445-446446-### ABA Protection
447447-- 2-bit generation counter per context slot
448448-- Incremented on each reallocation
449449-- Tokens carry the generation they were created under
450450-- On match attempt: if token's generation != slot's current generation,
451451- the token is stale and discarded
452452-- 4 generations before wraparound; stale tokens drain in 2-5 cycles,
453453- so wraparound collision is effectively impossible
454454-- Hardware cost: 2-bit counter + 2-bit comparator per slot. Trivial.
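The allocation, deallocation, and generation mechanics above can be sketched together. A behavioural model (the generation bump is placed at free time, so the next reallocation of a slot sees a new generation — equivalent to bumping on reallocation):

```python
# Bump allocator + idempotent free + 2-bit generation counters.
# N_SLOTS = 16 assumed (Config B). Free-slot-FIFO alternative would
# replace the scan with a queue pop.

N_SLOTS = 16

class SlotAllocator:
    def __init__(self):
        self.next = 0                        # bump counter
        self.free = [True] * N_SLOTS         # free bitmap
        self.gen = [0] * N_SLOTS             # 2-bit generation per slot

    def alloc(self):
        """Return (slot_id, generation), or None when saturated
        (where the hardware throttle would stall allocation)."""
        for i in range(N_SLOTS):             # priority-encoder scan
            slot = (self.next + i) % N_SLOTS
            if self.free[slot]:
                self.free[slot] = False
                self.next = (slot + 1) % N_SLOTS
                return (slot, self.gen[slot])
        return None

    def free_slot(self, slot):
        """Idempotent: compiler inserts FREE on every exit path."""
        if not self.free[slot]:
            self.free[slot] = True
            self.gen[slot] = (self.gen[slot] + 1) & 0b11   # 2-bit wrap
```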
455455-456456-### Throttle
457457-- Saturating counter tracks number of active (occupied) slots per PE
458458-- When counter = max slots, stalls new allocations until a free occurs
459459-- Prevents matching store overflow
460460-- Hardware cost: counter + comparator + gate. ~10 TTL chips.
461461-- With compiler-controlled scheduling, the throttle should rarely fire.
462462- It's a safety net, not a performance mechanism.
463463-464464-## Open Design Questions
465465-466466-1. **Context slot sizing**: Config B (16x32) vs Config C (32x32)?
467467- Depends on realistic concurrent activation counts for target programs.
468468- Need to compile some test programs and measure.
469469-2. **Matching store metadata storage**: flip-flop register file for
470470- occupied/port/gen, or tiny SRAM? Depends on slot count and available
471471- chip count budget per PE.
472472-3. **Instruction memory layout**: dyadic-first packing (option b) seems
473473- clean. Any cases where this constraint causes the compiler grief?
474474-4. **Free slot tracking**: bump allocator + bitmap + priority encoder?
475475- Or free-slot FIFO?
476476-5. **Instruction encoding**: operation set, format, how wide. Not yet
477477- specified. Must be wide enough to hold opcode + destination PE + dest
478478- offset + dest ctx_slot + any literals.
479479-6. **Function splitting heuristics**: how does the compiler decide where
480480- to split? Minimise cross-PE traffic? Balance slot usage across PEs?
481481- Hardware constraints (slot count, entry count) drive it.
482482-7. **Token format ctx_slot field width**: 4 bits (current, 16 slots)
483483- or 5 bits (32 slots, costs one bit from elsewhere)?
484484-485485-## Key References
486486-487487-- `17407_17358.pdf` — DFM evaluation: OM structure (1024 CAM blocks,
488488- 32 words each, 8 entries of 4 words, 4-way set-associative within
489489- entry). Function activation via CCU requesting least-loaded PE, then
490490- getting instance name from target PE's free instance table. IM is
491491- 8KW/PE, identical across all PEs. Critical for understanding why
492492- Amamiya's OM is so large and why ours can be much smaller.
493493-- `gurd1985.pdf` — Manchester matching unit: 16 parallel hash banks,
494494- 64K tokens each, 54-bit comparators, 180ns clock period. Overflow
495495- unit emulated in software. Shows the cost of general-purpose matching.
496496-- `Dataflow_Machine_Architecture.pdf` — Veen survey: matching store
497497- analysis, tag space management, overflow handling across multiple
498498- architectures.
499499-- `amamiya1982.pdf` — Original DFM paper: semi-CAM concept, IM/OM
500500- split, execution control mechanism with associative IM fetch.
501501- Partial function body execution (begin executing when first argument
502502- arrives, don't wait for all arguments).
-560
design-notes/versions/pe-design(2).md
···11-# Dynamic Dataflow CPU — PE (Processing Element) Design
22-33-Covers the CM (Control Module) pipeline, matching store, instruction memory,
44-context slot management, and per-PE identity.
55-66-See `architecture-overview.md` for token format and module taxonomy.
77-See `network-and-communication.md` for how tokens enter/leave the PE.
88-99-## Design Philosophy: Static Assignment, Compiler-Driven Sizing
1010-1111-This design diverges significantly from both Manchester and Amamiya in how
1212-PEs are used. Understanding the difference is critical to understanding why
1313-the matching store can be so much smaller here.
1414-1515-**Amamiya DFM (1982/17407 papers):** every PE has ALL function bodies
1616-pre-loaded in instruction memory (8KW, 58 bits/word per PE, identical
1717-contents across all PEs). Function *instances* are dynamically assigned to
1818-PEs at runtime by a CCU (Cluster Control Unit) that picks the least-loaded
1919-PE. The OM (operand matching memory) needs 1024 CAM blocks per PE because
2020-any function can run anywhere, and deep Lisp recursion means many
2121-simultaneous activations. The "semi-CAM" was their solution to making this
2222-affordable — instance name directly addresses a block, then 4-way
2323-set-associative lookup within the block on instruction identifier.
2424-2525-**Manchester (Gurd 1985):** similar story but with hashing instead of
2626-semi-CAM. 16 parallel 64K-token memory banks per PE for set-associative
2727-hash lookup. 1M token capacity matching store. Plus an overflow unit
2828-(initially emulated on the host). The matching unit alone was 16 memory
2929-boards per PE.
3030-3131-Both machines sized their matching stores for worst-case dynamic scheduling
3232-of arbitrary programs. The whole program lives in every PE (or in a single
3333-PE's matching unit), and any activation can land anywhere. That's why
3434-those matching stores are enormous.
3535-3636-**This design:** the compiler statically assigns function bodies (or chunks
3737-of them) to specific PEs. Different PEs have different instruction memory
3838-contents. The compiler knows at compile time which functions run where,
3939-and can calculate maximum concurrent activations per PE. This means:
4040-4141-- Instruction memory is NOT replicated — each PE only holds its assigned
4242- function bodies. IM can be much smaller.
4343-- The matching store only needs enough context slots for the maximum
4444- concurrent activations the compiler predicts for that specific PE.
4545- Not 1024. Probably 16-32.
4646-- No CCU needed for dynamic PE allocation. Scheduling decisions are
4747- made at compile time.
4848-- The tradeoff is scheduling flexibility — you can't dynamically
4949- rebalance load at runtime. The compiler must get it roughly right.
5050-5151-### Function Splitting Across PEs
5252-5353-A "function" in the source language does NOT need to map 1:1 to a
5454-contiguous block on one PE. The compiler can split a function body at
5555-any data-dependency boundary. The token network doesn't know or care
5656-whether two instructions are "in the same function" — it just sees tokens
5757-with destinations.
5858-5959-A 40-instruction function body could be split into three chunks of ~13
6060-instructions across three PEs, each chunk fitting in a smaller context
6161-slot. The "function" as the architecture sees it is really "a set of
6262-instructions that share a context slot ID on this PE." The compiler
6363-defines what that grouping means.
6464-6565-This is a powerful lever for keeping context slots small: if a function
6666-body is too big for the slot size, the compiler splits it. The split
6767-introduces inter-PE token traffic (extra network hops), but keeps
6868-per-PE hardware simple. The compiler can optimise the split points to
6969-minimise cross-PE traffic.
7070-7171-**Implication for context slot semantics:** a context slot doesn't mean
7272-"one function activation." It means "one chunk of work sharing a local
7373-operand namespace on this PE." Multiple context slots on different PEs
7474-might collectively represent one function activation. The token's ctx_slot
7575-field scopes operand matching to a local context, nothing more.
7676-7777-**Implication for the compiler:** this architecture actively wants either
7878-small functions or functions distributed across PEs. The compiler is free
7979-to treat any subgraph of the dataflow graph as a "chunk" and assign it to
8080-a PE, regardless of source-level function boundaries. Loop bodies, branch
8181-arms, pipeline stages — all valid chunk boundaries. The grain of
8282-scheduling is the subgraph, not the function.
8383-8484-## PE Identity
8585-8686-Each PE has a unique ID used for routing. Two mechanisms, not mutually
8787-exclusive:
8888-8989-**EEPROM-based**: the instruction decoder EEPROM already contains
9090-per-PE truth tables. The PE ID can be encoded as additional input bits
9191-to the EEPROM, meaning the EEPROM contents are unique per PE but the
9292-circuit board is identical. The instruction decoder "knows" which PE
9393-it is because its EEPROM was burned with that ID.
9494-9595-**DIP switches**: 3-4 switches give 8-16 PE addresses. Better for early
9696-prototyping — reconfigurable without reflashing. Can coexist with the
9797-EEPROM approach (switches provide ID bits that feed into the EEPROM
9898-address lines).
9999-100100-The PE ID is needed in two places:
101101-1. Input token filtering: "is this token addressed to me?"
102102-2. Output token formatting: "set the source PE field" (if result tokens
103103- carry source info for return routing)
104104-105105-## PE Pipeline (5-stage sketch)
106106-107107-```
108108-Stage 1: TOKEN INPUT
109109- - Receive token from network
110110- - Classify: type 00/01 (normal), type 11 subtype 01 (config write)
111111- - Normal tokens -> pipeline FIFO
112112- - Config writes -> instruction memory write port (stalls pipeline)
113113- - Buffer in small FIFO (8-deep, 32-bit)
114114- - ~1K transistors (flip-flops) or use small SRAM
115115-116116-Stage 2: MATCH / BYPASS
117117- - Type 00 (dyadic): direct-index into context slot array
118118- - Check generation counter: mismatch = stale, discard
119119- - First operand: store in slot, advance to wait state
120120- - Second operand: read partner from slot, both proceed
121121- - Type 01 (monadic): bypass matching entirely, proceed directly
122122- - Single cycle for all cases (no hash path, no CAM search —
123123- direct indexing only, see matching store section below)
124124- - Estimated: ~200-300 transistors + SRAM
125125-126126-Stage 3: INSTRUCTION FETCH
127127- - Use local offset to read from PE's instruction SRAM
128128- - External SRAM chip, so just address generation logic
129129- - ~200 transistors of logic
130130- - NOTE: instruction memory is shared between pipeline reads and
131131- network config writes — see "Instruction Memory" section below
132132-133133-Stage 4: EXECUTE
134134- - 8/16-bit ALU
135135- - ~500-2000 transistors depending on width and features
136136-137137-Stage 5: TOKEN OUTPUT
138138- - Form result token with routing prefix (type, destination PE/SM,
139139- offset, context, etc.)
140140- - Inject into network via output FIFO
141141- - ~300 transistors
142142-```
143143-144144-Pipeline registers between stages: ~500 transistors
145145-Control logic (state machine, handshaking): ~500-1000 transistors
146146-147147-**Per-PE total: ~3-5K transistors of logic + SRAM chips**
148148-149149-(Revised down from earlier 5-8K estimate. The matching stage is dramatically
150150-simpler than originally sketched now that hash fallback is removed from the
151151-primary pipeline. See matching store section.)
152152-153153-## Instruction Memory
154154-155155-### Static Assignment, Per-PE Contents
156156-157157-Unlike Amamiya where every PE has identical IM contents (full program),
158158-each PE here holds only the function bodies (or function chunks) assigned
159159-to it by the compiler. This means:
160160-161161-- IM is smaller per PE (only assigned code, not the whole program)
162162-- Different PEs have different IM contents (loaded at bootstrap)
163163-- The compiler emits a per-PE instruction image as part of the program
164164-165165-### Runtime Writability
166166-167167-Instruction memory is **not** read-only. It is writable from the network
168168-via type-11 subtype-01 (config/extended address) packets. This serves
169169-two purposes:
170170-171171-1. **Bootstrap**: loading programs before execution starts
172172-2. **Runtime reprogramming**: loading new function bodies while other PEs
173173- continue executing (future capability, not needed for v0)
174174-175175-Runtime writability also means instruction memory size is not a hard
176176-architectural limit — if a program needs more code than fits in one PE's
177177-IM, the runtime (or a management PE) could swap function bodies in and
178178-out. Very speculative, but the hardware path exists.
179179-180180-### Implementation
181181-182182-Instruction memory is external SRAM. The PE pipeline reads from it during
183183-Stage 3 (instruction fetch). The network can write to it via config
184184-packets received at Stage 1.
185185-186186-Shared SRAM means arbitration between two users:
187187-- Pipeline reads (instruction fetch): high frequency, performance-critical
188188-- Network writes (config): low frequency, can tolerate delay
189189-190190-**Arbitration approach**: network writes get priority when they arrive
191191-(they're rare and bursty during bootstrap). When a config write is in
192192-progress, the pipeline stalls for one cycle at Stage 3. Hardware cost:
193193-mux on SRAM address/data buses + write-enable gating + stall signal to
194194-pipeline. Roughly 5-8 TTL chips.
195195-196196-**Async-compatible arbitration**: defined as request/grant interface.
197197-Synchronous implementation: priority mux resolved on clock edge. Async
198198-implementation: mutual exclusion element (Seitz arbiter). Interface is
199199-the same in both cases. See `network-and-communication.md` for clocking
200200-discipline.
201201-202202-### EEPROM-Based Instruction Decoding
203203-204204-The instruction decoder can be implemented as an EEPROM acting like a PLD.
205205-Input bits = instruction opcode fields + PE ID bits. Output bits = control
206206-signals for the ALU, matching store, token output formatter, etc.
207207-208208-This gives significant flexibility:
209209-- Instruction set can be changed by reflashing the EEPROM (no board changes)
210210-- Per-PE customisation (different PEs could theoretically have different
211211- instruction subsets, though unlikely for v0)
212212-- The PE ID is "free" — it's just more EEPROM address bits
213213-214214-## Matching Store Design
215215-216216-### Why It Can Be Small
217217-218218-The matching store is the highest-risk component in any dataflow machine.
219219-Manchester needed 16 memory boards per PE. Amamiya needed 1024 CAM blocks
220220-(32KW at 43 bits/word) per PE. Both were sized for worst-case dynamic
221221-scheduling of arbitrary programs.
222222-223223-This design avoids that because:
224224-225225-1. **Static PE assignment**: the compiler knows which functions run on
226226- which PE and can calculate maximum concurrent activations per PE.
227227-2. **Function splitting**: the compiler can split large function bodies
228228- across PEs so no single PE needs a huge context slot.
229229-3. **Compiler-controlled slot allocation**: the compiler assigns context
230230- slot IDs at compile time for statically-known activations. Only
231231- genuinely dynamic activations (runtime-determined recursion depth)
232232- need runtime allocation.
233233-234234-The matching store size is therefore a *compiler parameter*, not an
235235-architectural constant. The hardware provides N context slots of M entries
236236-each. The compiler must generate code that fits within those limits,
237237-splitting and scheduling accordingly.
238238-239239-### Architecture: Pure Direct-Indexed Context Slots
240240-241241-No hash fallback. No CAM search. No set-associative lookup.
242242-243243-The matching operation is:
244244-245245-```
246246-SRAM_address = [ctx_slot (from token) : match_entry (from token or instr)]
247247-read SRAM at that address
248248-check generation counter: mismatch = stale, discard token
249249-250250-if occupied bit set:
251251- -> match found, read stored operand, proceed to instruction fetch
252252- -> clear occupied bit
253253-else:
254254- -> no match, write incoming operand, set occupied bit
255255- -> token consumed, advance to next input token
256256-```
257257-258258-Single cycle. Always. The only comparison is the generation counter check
259259-(2-bit XOR, trivial). There is no "miss" path that requires multi-cycle
260260-recovery, because the address is deterministic — the compiler guaranteed
261261-that ctx_slot + match_entry uniquely identifies this matching location.
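
The match step above can be modelled in a few lines of Python (class and field names are illustrative, not the emulator's actual API; Config B geometry assumed):

```python
# Sketch of the single-cycle match step: deterministic address, no search.
from dataclasses import dataclass, field

SLOTS, ENTRIES = 16, 32  # Config B

@dataclass
class MatchingStore:
    values: list = field(default_factory=lambda: [0] * (SLOTS * ENTRIES))
    occupied: list = field(default_factory=lambda: [False] * (SLOTS * ENTRIES))
    gen: list = field(default_factory=lambda: [0] * SLOTS)  # 2-bit, per slot

    def present(self, ctx_slot, match_entry, token_gen, operand):
        """Return the partner operand on a match; None if stored or stale."""
        if token_gen != self.gen[ctx_slot]:
            return None                          # stale token: discard
        addr = ctx_slot * ENTRIES + match_entry  # [ctx_slot : match_entry]
        if self.occupied[addr]:
            self.occupied[addr] = False          # match: clear occupied bit
            return self.values[addr]             # proceed to instruction fetch
        self.values[addr] = operand              # first operand: store it
        self.occupied[addr] = True
        return None                              # wait for the partner
```

Note there is no failure path: the address is computed, not searched for.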
262262-263263-Hardware cost:
264264-- One SRAM chip (matching store data)
265265-- Small register file or SRAM (occupied bitmap + port flags)
266266-- Address generation: concatenate ctx_slot bits and match_entry bits (wires)
267267-- Read/write control: occupied bit check + generation compare (~3-4 chips)
268268-- Generation counter storage: 2 bits per context slot (~1 chip for 32 slots)
269269-270270-Total matching logic per PE: **~200-300 transistors + one SRAM chip + bitmap.**
271271-Order of magnitude less than the 2-3K transistors estimated when the
272272-design included a hash fallback path.
273273-274274-### Context Slot Sizing
275275-276276-The slot count (N) vs entry count (M) tradeoff maps to:
277277-- **N (slots)**: how many concurrent activations can this PE handle
278278-- **M (entries per slot)**: how many dyadic instructions per function chunk
279279-280280-Both are compiler-controllable. More slots = more parallelism headroom.
281281-More entries per slot = bigger function bodies without splitting. The
282282-compiler balances these.
283283-284284-**Candidate configurations (targeting clean SRAM utilisation):**
285285-286286-```
287287-Config A: 32 slots x 16 entries = 512 cells
288288- - 9-bit SRAM address (5 ctx + 4 offset)
289289- - 16-bit values: 1KB exactly in one 8Kbit SRAM chip
290290- - Good concurrency headroom, smaller function chunks
291291-292292-Config B: 16 slots x 32 entries = 512 cells
293293- - 9-bit SRAM address (4 ctx + 5 offset)
294294- - 16-bit values: 1KB exactly
295295- - Matches current 4-bit ctx_slot token format
296296- - Fewer concurrent activations, bigger function chunks
297297-298298-Config C: 32 slots x 32 entries = 1024 cells
299299- - 10-bit SRAM address (5 ctx + 5 offset)
300300- - 16-bit values: 2KB, fits in one 16Kbit SRAM chip
301301- - Most headroom in both dimensions
302302- - Probably the sweet spot for SRAM utilisation vs headroom
303303-304304-Config D: 64 slots x 16 entries = 1024 cells
305305- - 10-bit SRAM address (6 ctx + 4 offset)
306306- - 16-bit values: 2KB
307307- - Favours concurrency over function chunk size
308308- - 64 concurrent activations likely overkill for v0 but future-proof
309309-```
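
The figures in the table follow directly from the slot/entry counts; a small sketch to check them (pure arithmetic, assuming power-of-two dimensions and 16-bit values):

```python
# Sketch: derive cell count, SRAM address width, and byte size per config.
def sram_geometry(slots: int, entries: int, value_bits: int = 16):
    cells = slots * entries
    addr_bits = (slots.bit_length() - 1) + (entries.bit_length() - 1)
    return cells, addr_bits, cells * value_bits // 8  # (cells, bits, bytes)

assert sram_geometry(32, 16) == (512, 9, 1024)    # Config A
assert sram_geometry(16, 32) == (512, 9, 1024)    # Config B
assert sram_geometry(32, 32) == (1024, 10, 2048)  # Config C
assert sram_geometry(64, 16) == (1024, 10, 2048)  # Config D
```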
310310-311311-**SRAM layout:** store 16-bit operand values in the main SRAM chip
312312-(standard 8-bit-wide chips, 2 bytes per entry, sequential access or
313313-use 16-bit-wide SRAM if available). Occupied flags + port indicators
314314-stored separately:
315315-316316-- **Occupied bitmap**: 1 bit per cell. 512 cells = 64 bytes (trivially
317317- small — a few flip-flops or one tiny SRAM). 1024 cells = 128 bytes.
318318-- **Port indicator**: 1 bit per cell (left/right operand). Same size as
319319- occupied bitmap. Can share the same storage.
320320-- **Generation counters**: 2 bits per *slot* (not per cell). 32 slots =
321321- 8 bytes. Trivial — a small register file or a handful of flip-flops.
322322-323323-This separation means the main SRAM stores only 16-bit values at clean
324324-power-of-two addresses. No awkward 18-bit word widths. The metadata
325325-(occupied, port, gen) is tiny and stored in dedicated fast-access
326326-registers alongside the SRAM.
327327-328328-**Recommendation for v0**: start with Config B (16 slots x 32 entries =
329329-1KB) to match the current 4-bit ctx_slot token field. Upgrade to Config C
330330-(32 x 32 = 2KB, needs 5-bit ctx_slot) if 16 concurrent activations
331331-proves too tight. The physical SRAM chip doesn't change between these
332332-configs — just the address generation logic.
333333-334334-### Instruction Address vs Matching Store Address
335335-336336-These are NOT the same thing, and this distinction matters:
337337-338338-- **Instruction address** (used in Stage 3): indexes into instruction
339339- memory SRAM. 7-8 bits (128-256 instructions per PE). Used by ALL
340340- token types. This is the "offset" field in the token.
341341-- **Matching store address** (used in Stage 2): indexes into matching
342342- store SRAM. Composed of [ctx_slot : match_entry]. Only used by
343343- dyadic tokens.
344344-345345-The compiler maintains the mapping. For dyadic instructions, the
346346-instruction word in IM includes a "match_entry" field that tells the
347347-hardware which matching store entry corresponds to this instruction.
348348-349349-This means the matching store is dense with respect to dyadic instructions
350350-— no gaps for monadic instructions. A function chunk with 20 instructions,
351351-8 of which are dyadic, uses 8 matching store entries, not 20.
352352-353353-**Simplest v0 approach:** the token carries the instruction memory offset
(for Stage 3). It is tempting to have the instruction word fetched in
Stage 3 carry the match_entry index — but Stage 2 happens BEFORE Stage 3,
so the match_entry must come from the token, not the instruction word.
This means either:
362362-363363-(a) The token carries both an instruction offset AND a match entry index.
364364- Costs token bits. May require the offset field to be split or the
365365- match_entry to be packed into unused bits.
366366-367367-(b) The match_entry IS the instruction offset, and the instruction memory
368368- is laid out so that the offset of a dyadic instruction is also its
369369- matching store entry within the slot. This works if the compiler
370370- assigns offsets such that dyadic instruction offsets are dense (0, 1,
371371- 2, ...) and monadic instruction offsets are in a separate range.
372372-373373-(c) A small lookup ROM/SRAM alongside the matching store maps instruction
374374- offset -> match_entry. This adds a read before the match SRAM access
375375- (serial, adds latency) or requires a second SRAM port (parallel, adds
376376- hardware).
377377-378378-Option (b) is the simplest if the compiler can make it work. The
379379-instruction memory layout would be: dyadic instructions at offsets 0..M-1,
380380-monadic instructions at offsets M..N-1. The token's offset field directly
381381-indexes both the matching store (for dyadic) and the instruction memory
382382-(for everything). The matching store just doesn't get accessed for offsets
383383->= M (monadic range).
384384-385385-This constrains instruction memory layout — dyadic instructions must be
386386-packed at the low end. But the compiler controls the layout, so this is
387387-achievable.
388388-389389-**Recommendation for v0:** option (b). Dyadic instructions packed at
390390-offsets 0..M-1 in instruction memory, monadic at M..N-1. Token offset
391391-directly serves as both instruction address and (for offsets < M)
392392-matching store entry within the context slot. Clean, no extra bits,
393393-no extra lookup, single cycle. Constraint on the compiler, not on the
394394-hardware.
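
The option (b) scheme reduces to one comparison. A sketch, with illustrative names and an assumed dyadic-region size M:

```python
# Sketch of option (b): the token's offset doubles as the matching store
# entry for dyadic instructions; offsets >= M (monadic) bypass matching.
M = 8  # dyadic instructions packed at offsets 0..M-1 (compiler-enforced)

def route_token(offset: int, ctx_slot: int, entries_per_slot: int = 32):
    """Return (needs_match, match_addr) for an incoming token."""
    if offset < M:
        return True, ctx_slot * entries_per_slot + offset
    return False, None  # monadic: straight to instruction fetch

assert route_token(3, ctx_slot=2) == (True, 67)    # 2*32 + 3
assert route_token(12, ctx_slot=2) == (False, None)
```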
395395-396396-### What About Overflow?
397397-398398-If the matching store is full (all slots occupied) or a function body
399399-exceeds M dyadic instructions:
400400-401401-**Compile-time prevention (primary strategy):**
402402-- The compiler knows the slot count and entry count
403403-- It splits functions and schedules activations to fit
404404-- If a program genuinely can't fit (unbounded recursion deeper than N
405405- slots), the compiler inserts throttling code: a token that waits for
406406- a slot to free before allowing the next recursive call
407407-- This is the Amamiya throttle idea, but implemented in software
408408- (compiler-inserted dataflow logic) rather than hardware
409409-410410-**Runtime overflow (safety net):**
411411-- If a token arrives for a full matching store (shouldn't happen with
412412- correct compilation), the PE stalls the input FIFO until a slot frees.
413413- Simplest, safest, most debuggable. If it fires, something is wrong
414414- and stalling surfaces the bug.
415415-416416-**Future: small CAM overflow buffer**
417417-- If runtime overflow becomes a real issue (genuinely unpredictable
418418- recursion depth), a small CAM (4-8 entries using 100142 chips or
419419- similar) per PE could catch overflow tokens
420420-- Sits between input FIFO and SRAM matching store, catches tokens
421421- that don't fit, retries when slots free up
422422-- Not needed for v0. The input FIFO interface doesn't change.
423423-- 100142 chips (4 words x 4 bits) could give a 4-entry overflow buffer
424424- at maybe 6-8 chips per PE. Small but might handle 95% of overflow
425425- cases where a slot frees within a few cycles.
426426-427427-## Context Slot Lifecycle
428428-429429-### Allocation: Bump Allocator
430430-- Counter + register per PE
431431-- On function activation: current counter value = new context slot ID
432432-- Counter increments (wraps around to 0 after max slot)
433433-- On wrap: checks occupied bitmap to find next free slot
434434-- Hardware: binary counter + bitmap register + priority encoder for
435435- free-slot finding. ~8-10 TTL chips.
436436-- Alternative: small FIFO of free slot IDs, populated at init and on
437437- deallocation. Avoids bitmap scan. ~5-8 chips.
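
A behavioural sketch of the bump allocator with wraparound over the occupied bitmap, including the generation bump on reallocation (illustrative model, not the emulator's actual resource class):

```python
# Sketch: bump allocator + occupied bitmap + 2-bit generation per slot.
class SlotAllocator:
    def __init__(self, n_slots: int = 16):
        self.n = n_slots
        self.counter = 0
        self.in_use = [False] * n_slots
        self.gen = [0] * n_slots           # 2-bit generation per slot

    def alloc(self):
        """Scan from the counter for a free slot (the priority-encoder role)."""
        for i in range(self.n):
            slot = (self.counter + i) % self.n
            if not self.in_use[slot]:
                self.in_use[slot] = True
                self.gen[slot] = (self.gen[slot] + 1) & 0b11  # ABA protection
                self.counter = (slot + 1) % self.n
                return slot, self.gen[slot]
        return None                        # full: the throttle stalls here

    def free(self, slot):
        self.in_use[slot] = False          # idempotent by construction
```

The free-slot FIFO alternative replaces the scan loop with a pop from a queue; the interface stays the same.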
438438-439439-### Deallocation
440440-- Compiler inserts explicit "free" instruction on every exit path
441441-- Free instruction clears the slot's occupied bits (all entries in
442442- the slot) and returns the slot ID to the free pool
443443-- Multiple frees are idempotent / harmless
444444-- Freed slots are immediately available for reallocation
445445-446446-### ABA Protection
447447-- 2-bit generation counter per context slot
448448-- Incremented on each reallocation
449449-- Tokens carry the generation they were created under
450450-- On match attempt: if token's generation != slot's current generation,
451451- the token is stale and discarded
452452-- 4 generations before wraparound; stale tokens drain in 2-5 cycles,
453453- so wraparound collision is effectively impossible
454454-- Hardware cost: 2-bit counter + 2-bit comparator per slot. Trivial.
455455-456456-### Throttle
457457-- Saturating counter tracks number of active (occupied) slots per PE
458458-- When counter = max slots, stalls new allocations until a free occurs
459459-- Prevents matching store overflow
460460-- Hardware cost: counter + comparator + gate. ~10 TTL chips.
461461-- With compiler-controlled scheduling, the throttle should rarely fire.
462462- It's a safety net, not a performance mechanism.
463463-464464-## Open Design Questions
465465-466466-1. **Context slot sizing**: Config B (16x32) vs Config C (32x32)?
467467- Depends on realistic concurrent activation counts for target programs.
468468- Need to compile some test programs and measure.
469469-2. **Matching store metadata storage**: flip-flop register file for
470470- occupied/port/gen, or tiny SRAM? Depends on slot count and available
471471- chip count budget per PE.
472472-3. **Instruction memory layout**: dyadic-first packing (option b) seems
473473- clean. Any cases where this constraint causes the compiler grief?
474474-4. **Free slot tracking**: bump allocator + bitmap + priority encoder?
475475- Or free-slot FIFO?
476476-5. **Instruction encoding**: operation set, format, how wide. Not yet
477477- specified. Must be wide enough to hold opcode + destination PE + dest
478478- offset + dest ctx_slot + any literals.
479479-6. **Function splitting heuristics**: how does the compiler decide where
480480- to split? Minimise cross-PE traffic? Balance slot usage across PEs?
481481- Hardware constraints (slot count, entry count) drive it.
482482-7. **Token format ctx_slot field width**: 4 bits (current, 16 slots)
483483- or 5 bits (32 slots, costs one bit from elsewhere)?
484484-485485-## Dynamic Scheduling: Future Capability
486486-487487-The architecture is **policy-agnostic** on whether PE assignment is fully
488488-static (compiler decides everything) or partially dynamic (a scheduler
places activations at runtime). The mechanism — tokens carry destination
PE + ctx_slot, PEs have writable IRAM, matching store is addressed by
ctx_slot — supports either policy.
492492-493493-### Static Assignment (v0)

Compiler decides everything at compile time. Each PE gets specific
function fragments loaded at bootstrap. No runtime decisions about
placement. Simplest, no scheduler hardware or firmware needed.
498498-499499-### Dynamic Scheduling (future)
500500-501501-A CCU-like scheduler (could be firmware on a dedicated PE, a small
502502-fixed-function unit, or distributed logic) decides at runtime where to
503503-place new activations, based on PE load, IRAM contents, etc.
504504-505505-The tension: dynamic scheduling wants **wide IRAM** (so the target PE
506506-already has the function body loaded), while cheap PEs want **narrow
507507-IRAM**. Amamiya resolved this by replicating the entire program into
every PE's IRAM. That's one approach, but it costs a lot of memory.
509509-510510-The middle ground is a **working set model**: keep hot function bodies
511511-loaded, swap cold ones via type-11 config writes when the scheduler
wants to place an activation on a PE that doesn't have the code yet.
This is demand paging for instruction memory.

- **Miss latency**: significant (network round-trip to load code from
  flash/SM/another PE's IRAM). Much worse than Amamiya's "already there."
- **Miss rate**: depends on scheduler affinity policy. If the scheduler
  prefers placing activations on PEs that already have the code, misses
  should be rare. A small "IRAM directory" (which PE has which function
  body loaded) lets the scheduler make this decision cheaply.
- **Coordination**: drain in-flight tokens for the old fragment before
  overwriting IRAM. The throttle stalls new activations for that fragment,
  existing ones complete, then overwrite. A coarse-grained context switch.
524524-525525-The hardware path is already there — writable IRAM + type-11 config
writes + throttle. The missing piece is the scheduler, which is a
software/firmware problem. Nothing in the v0 hardware prevents adding
this later.
529529-530530-### What Changes If You Want Dynamic Scheduling
531531-532532-The main hardware implication: if the same function body might run on
533533-different PEs at different times, the **instruction memory needs to be
534534-large enough to hold a useful working set**, not just one program's
worth of fragments. This argues for bigger IRAM per PE (4Kx8 or 8Kx8
536536-instead of 2Kx8) even if v0 programs don't need it. SRAM is cheap;
537537-leaving headroom costs one chip size bump, not a redesign.

The matching store size is less affected — context slot count is about
concurrency, not code size. 16-32 slots handles most realistic
activation depths regardless of whether assignment is static or dynamic.
542542-543543-## Key References
544544-545545-- `17407_17358.pdf` — DFM evaluation: OM structure (1024 CAM blocks,
546546- 32 words each, 8 entries of 4 words, 4-way set-associative within
547547- entry). Function activation via CCU requesting least-loaded PE, then
548548- getting instance name from target PE's free instance table. IM is
549549- 8KW/PE, identical across all PEs. Critical for understanding why
550550- Amamiya's OM is so large and why ours can be much smaller.
551551-- `gurd1985.pdf` — Manchester matching unit: 16 parallel hash banks,
552552- 64K tokens each, 54-bit comparators, 180ns clock period. Overflow
553553- unit emulated in software. Shows the cost of general-purpose matching.
554554-- `Dataflow_Machine_Architecture.pdf` — Veen survey: matching store
555555- analysis, tag space management, overflow handling across multiple
556556- architectures.
557557-- `amamiya1982.pdf` — Original DFM paper: semi-CAM concept, IM/OM
558558- split, execution control mechanism with associative IM fetch.
559559- Partial function body execution (begin executing when first argument
560560- arrives, don't wait for all arguments).
-212
design-notes/versions/pe-design.md
···11-# Dynamic Dataflow CPU — PE (Processing Element) Design
22-33-Covers the CM (Control Module) pipeline, matching store, instruction memory,
44-context slot management, and per-PE identity.
55-66-See `architecture-overview.md` for token format and module taxonomy.
77-See `network-and-communication.md` for how tokens enter/leave the PE.
88-99-## PE Identity

Each PE has a unique ID used for routing. Two mechanisms, not mutually
exclusive:

**EEPROM-based**: the instruction decoder EEPROM already contains
per-PE truth tables. The PE ID can be encoded as additional input bits
to the EEPROM, meaning the EEPROM contents are unique per PE but the
circuit board is identical. The instruction decoder "knows" which PE
it is because its EEPROM was burned with that ID.

**DIP switches**: 3-4 switches give 8-16 PE addresses. Better for early
prototyping — reconfigurable without reflashing. Can coexist with the
EEPROM approach (switches provide ID bits that feed into the EEPROM
address lines).

The PE ID is needed in two places:
1. Input token filtering: "is this token addressed to me?"
2. Output token formatting: "set the source PE field" (if result tokens
   carry source info for return routing)
2929-3030-## PE Pipeline (5-stage sketch)
3131-3232-```
3333-Stage 1: TOKEN INPUT
3434- - Receive token from network
3535- - Classify: type 00/01 (normal), type 11 subtype 01 (config write)
3636- - Normal tokens -> pipeline FIFO
3737- - Config writes -> instruction memory write port (stalls pipeline)
3838- - Buffer in small FIFO (8-deep, 32-bit)
3939- - ~1K transistors (flip-flops) or use small SRAM
4040-4141-Stage 2: MATCH / BYPASS
4242- - Type 00 (dyadic): direct-index into context slot array
4343- - Check generation counter: mismatch = stale, discard
4444- - First operand: store in slot, advance to wait state
4545- - Second operand: read partner from slot, both proceed
4646- - Type 01 (monadic): bypass matching entirely, proceed directly
4747- - Common case (direct index): single cycle
4848- - Hash fallback path for dynamic/overflow: multi-cycle
4949- - Estimated: 2-3K transistors + SRAM
5050-5151-Stage 3: INSTRUCTION FETCH
5252- - Use local offset to read from PE's instruction SRAM
5353- - External SRAM chip, so just address generation logic
5454- - ~200 transistors of logic
5555- - NOTE: instruction memory is shared between pipeline reads and
5656- network config writes — see "Instruction Memory" section below
5757-5858-Stage 4: EXECUTE
5959- - 8/16-bit ALU
6060- - ~500-2000 transistors depending on width and features
6161-6262-Stage 5: TOKEN OUTPUT
6363- - Form result token with routing prefix (type, destination PE/SM,
6464- offset, context, etc.)
6565- - Inject into network via output FIFO
6666- - ~300 transistors
6767-```
6868-6969-Pipeline registers between stages: ~500 transistors
7070-Control logic (state machine, handshaking): ~500-1000 transistors
7171-7272-**Per-PE total: ~5-8K transistors of logic + SRAM chips**
7373-7474-## Instruction Memory
7575-7676-### Runtime Writability

Instruction memory is **not** read-only. It is writable from the network
via type-11 subtype-01 (config/extended address) packets. This serves
two purposes:
8181-8282-1. **Bootstrap**: loading programs before execution starts
8383-2. **Runtime reprogramming**: loading new function bodies while other PEs
8484- continue executing (future capability, not needed for v0)
8585-8686-### Implementation

Instruction memory is external SRAM. The PE pipeline reads from it during
Stage 3 (instruction fetch). The network can write to it via config
packets received at Stage 1.
9191-9292-Shared SRAM means arbitration between two users:
9393-- Pipeline reads (instruction fetch): high frequency, performance-critical
9494-- Network writes (config): low frequency, can tolerate delay

**Arbitration approach**: network writes get priority when they arrive
(they're rare, and bursty during bootstrap). When a config write is in
progress, the pipeline stalls for one cycle at Stage 3. Hardware cost:
mux on SRAM address/data buses + write-enable gating + stall signal to
the pipeline.

**Async-compatible arbitration**: defined as a request/grant interface.
Synchronous implementation: priority mux resolved on clock edge. Async
implementation: mutual exclusion element (Seitz arbiter). The interface is
the same in both cases. See `network-and-communication.md` for clocking
discipline.
107107-108108-### EEPROM-Based Instruction Decoding

The instruction decoder can be implemented as an EEPROM acting like a PLD.
Input bits = instruction opcode fields + PE ID bits. Output bits = control
signals for the ALU, matching store, token output formatter, etc.

This gives significant flexibility:
- Instruction set can be changed by reflashing the EEPROM (no board changes)
- Per-PE customisation (different PEs could theoretically have different
  instruction subsets, though this is unlikely for v0)
- The PE ID is "free" — it's just more EEPROM address bits
119119-120120-## Context Slot Lifecycle

See `architecture-overview.md` for the high-level description. Detailed
hardware design below.
124124-125125-### Allocation: Bump Allocator
126126-- Counter + register per PE
127127-- On function activation: current counter value = new context slot ID
128128-- Counter increments
129129-- Hardware: binary counter + output register. ~5 TTL chips.
130130-131131-### Deallocation
132132-- Compiler inserts explicit "free" instruction on every exit path
133133-- Free instruction resets the slot's "occupied" bit
134134-- Multiple frees are idempotent / harmless
135135-- Freed slots are available for reuse by the bump allocator
136136- (allocator wraps around and checks "occupied" bits, or a free list
137137- is maintained — TBD)
138138-139139-### ABA Protection
140140-- 2-bit generation counter per context slot
141141-- Incremented on each reallocation
142142-- Tokens carry the generation they were created under
143143-- On match attempt: if token's generation != slot's current generation,
144144- the token is stale and discarded
145145-- 4 generations before wraparound; stale tokens drain in 2-5 cycles,
146146- so wraparound collision is effectively impossible
- Hardware cost: 2-bit counter + 2-bit comparator per slot. Small.
148148-149149-### Throttle
150150-- Saturating counter tracks number of active (occupied) slots per PE
151151-- When counter = max slots, stalls new allocations until a free occurs
152152-- Prevents matching store overflow
153153-- Hardware cost: counter + comparator + gate. ~10 TTL chips.
154154-155155-## Matching Store Design (highest-risk component)
156156-157157-### Primary Path: Direct-Indexed Context Slots (Amamiya semi-CAM)
158158-159159-- Bump allocator assigns context slot IDs to function activations
160160-- Context slot ID directly addresses a bank of SRAM
161161-- Instruction offset within function body used as direct address within
162162- that bank
163163-- **Single-cycle matching for the common case** — no hashing, no search
- This is the critical performance path. If this works well, the PE is
  competitive. If it doesn't, nothing else matters.
166166-167167-### Fallback Path: Hash-Based Matching
168168-169169-For dynamic or overflow cases where direct indexing doesn't apply:
170170-171171-- Multiplicative hashing: `(a * K) >> (w - m)` — simple to implement
172172- in hardware (shift register + adder chain, or lookup table)
173173-- Multi-bank (4-8 banks) checked in parallel for collision tolerance
174174- (Manchester-style set-associative)
175175-- Overflow to linked list or dedicated overflow buffer for worst case
176176-- This path is multi-cycle — acceptable because it's the uncommon case
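
The multiplicative hash named above is a one-liner to model; the constant and widths here are illustrative (Knuth's golden-ratio multiplier scaled to 16 bits):

```python
# Sketch of h = (a * K) >> (w - m): w = tag width, m = log2(bucket count).
W = 16                 # tag width in bits
M = 6                  # 2^6 = 64 buckets per bank
K = 40503              # odd multiplier ~ 2^16 / golden ratio

def mhash(tag: int) -> int:
    return ((tag * K) & ((1 << W) - 1)) >> (W - M)
```

In hardware the multiply is a shift-and-add chain (or a lookup table) and the shift is just wiring, which is why this function is attractive for a TTL-era build.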
177177-178178-### Compiler-Assisted Tag Assignment
179179-180180-- Static-lifetime values get contiguous, dense tags — sequential readout,
181181- no hashing
182182-- Dynamic activations get allocated tags via bump allocator
183183-- Potential for hybrid: half of matching store uses precalculated tags,
184184- half uses runtime hash
185185-186186-### Monadic/Dyadic Optimisation (deferred)
187187-188188-- Compiler assigns matching store indices only to dyadic nodes
189189-- Monadic nodes bypass matching, don't consume matching store cells
190190-- Requires indirection: matching store cell includes instruction address
191191- pointer
192192-- Cell width increases (~8 bits for instr_addr) but cell count decreases
193193- (~60% fewer)
194194-- local_offset in token = matching store index, NOT instruction address
195195-- **Deferred for v0**: simpler to have local_offset = instruction address
196196- = matching store address
197197-198198-## Open Design Questions

1. **Context slot count per CM** — 4 bits = 16 slots. Each slot needs
   enough SRAM to hold one operand per instruction in the function body.
   If function bodies are up to 64 instructions, each slot is 64 x 16-bit
   = 128 bytes. 16 slots = 2KB SRAM for the matching store. Is 16 enough?
2. **Free slot tracking** — bump allocator with wraparound + occupied bits?
   Or an explicit free list (small FIFO of freed slot IDs)?
3. **Hash fallback** — how many banks? What hash function exactly? Worth
   prototyping in FPGA first (see `design-alternatives.md`)?
4. **Instruction encoding** — operation set, format, how wide. This
   determines Stage 3 and Stage 4 design. Not yet specified.
5. **Instruction memory write protocol** — exact handshake between
   "config write arrived" and "pipeline stalled, writing SRAM." Needs
   to be fully specified before building.
-208
design-notes/versions/sm-design.md
···11-# Dynamic Dataflow CPU — SM (Structure Memory) Design
22-33-Covers the SM interface protocol, operation set, banking scheme, address
44-space extension, and hardware architecture.
55-66-See `architecture-overview.md` for module taxonomy and token format.
77-See `network-and-communication.md` for how SM connects to the bus.
88-99-## Role

SM stores structured data (arrays, lists, heap) and performs operations on
it. It is NOT used for I/O mapping — I/O lives in the type-11 subsystem
(see `io-and-bootstrap.md`).

SM is a pure data store with embedded functional units for atomic operations.
From a CM's perspective: send a type-10 request, get a result token back
eventually. Split-phase, asynchronous relative to the requesting CM.
1818-1919-## Interface Protocol
2020-2121-Stateless request handling: the request token carries its own return routing
2222-info in the bits that are unused by that operation type. SM never maintains
2323-pending-request state — result packets are self-addressed.
2424-2525-### Request Formats (type 10, received on AN)
2626-2727-```
2828-READ request (data field repurposed for return routing):
2929-[type:2][SM_id:2][op:3][address:9][ret_CM:2][ret_offset:8][ret_ctx:4][ret_port:1][pad:1]
3030-3131-WRITE request (data field carries write data, no response needed):
3232-[type:2][SM_id:2][op:3][address:9][data:16]
3333-3434-READ_INC / READ_DEC (same as READ format — return routing in data field):
3535-[type:2][SM_id:2][op:3][address:9][ret_CM:2][ret_offset:8][ret_ctx:4][ret_port:1][pad:1]
3636-3737-CAS — compare-and-swap (two-flit operation):
3838-Flit 1: [type:2][SM_id:2][op:3][address:9][expected_value:16]
3939-Flit 2: [new_value:16][ret_CM:2][ret_offset:8][ret_ctx:4][ret_port:1][pad:1]
4040-```
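
The READ layout above can be checked by packing it MSB-first (helper name is illustrative; field order and widths are taken from the format table):

```python
# Sketch: pack a READ request into the 32-bit format, MSB-first.
def pack_read(sm_id, op, address, ret_cm, ret_offset, ret_ctx, ret_port):
    fields = [(0b10, 2), (sm_id, 2), (op, 3), (address, 9),
              (ret_cm, 2), (ret_offset, 8), (ret_ctx, 4),
              (ret_port, 1), (0, 1)]               # type=10, pad=0
    word = 0
    for value, width in fields:
        word = (word << width) | (value & ((1 << width) - 1))
    return word

req = pack_read(sm_id=1, op=0b000, address=42, ret_cm=2,
                ret_offset=17, ret_ctx=3, ret_port=1)
assert req >> 30 == 0b10           # type field in the top two bits
assert (req >> 16) & 0x1FF == 42   # address field recovers cleanly
```

The widths sum to exactly 32 bits, which is the check that matters when juggling these formats.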
4141-4242-### Result Format (on DN, repackaged as type 00 or 01)
4343-4444-SM extracts return routing from the request and constructs a normal token:
4545-4646-```
4747-Result -> type 00 (dyadic) or type 01 (monadic) token:
4848-[type:2][ret_CM:2][ret_ctx:4][gen:?][ret_offset:7/8][ret_port:1][fetched_data:14/20]
4949-```

The requesting CM specified where this result should land (which context
slot, which offset, which port). SM just repackages. The result looks
like any other token arriving at the CM — the CM doesn't know or care
that it came from SM.

**Open question**: the return routing in READ requests carries ret_ctx (4 bits)
but not gen (2 bits). The result token needs gen if it's type 00 (dyadic).
Either: (a) SM result tokens are always monadic (type 01, no gen needed),
or (b) the ret_ctx field is widened to include gen bits (eating into
padding), or (c) the requesting CM stores the gen locally and the result
matches without it. Option (a) is simplest — SM results bypass matching
and go straight to instruction fetch. This means SM results always feed
monadic instruction inputs.
6464-6565-## Operation Set (3-bit opcode, 8 slots)
6666-6767-```
6868-000: READ — read address, return data via DN
6969-001: WRITE — write data to address (no DN response)
7070-010: READ_INC — atomic fetch-and-add(+1), return old value
7171-011: READ_DEC — atomic fetch-and-add(-1), return old value
7272-100: CAS — compare-and-swap (two-flit), return old value + success bit
7373-101: ALLOC — (future) allocate N cells, return base address
7474-110: FREE — (future) mark cells as available
7575-111: RESERVED
7676-```

READ_INC / READ_DEC are fetch-and-add primitives. They give atomic pointer
operations and reference counting without dedicated refcount hardware. The CM
checks the returned value for zero (refcount exhausted) using its normal ALU.

CAS is the general-purpose atomic primitive. Two-flit: the first flit carries
the expected value, the second flit carries the new value + return routing.
SM compares memory contents with expected, swaps if they match, and returns
the old value either way. Success/fail can be inferred by comparing the
returned old value with expected (the CM does this with its ALU).
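
The SM-side semantics of the atomics fit in a few lines (16-bit cells assumed; the CM-side success check is the final comparison):

```python
# Sketch: SM-side semantics for READ_INC/READ_DEC and CAS.
MASK = 0xFFFF  # 16-bit cells

def fetch_add(mem: list, addr: int, delta: int) -> int:
    old = mem[addr]
    mem[addr] = (old + delta) & MASK   # READ_INC: +1, READ_DEC: -1
    return old                         # old value returned via DN

def cas(mem: list, addr: int, expected: int, new: int) -> int:
    old = mem[addr]
    if old == expected:
        mem[addr] = new & MASK
    return old                         # returned either way; CM compares

mem = [0] * 512
assert fetch_add(mem, 7, +1) == 0 and mem[7] == 1
assert cas(mem, 7, expected=1, new=99) == 1 and mem[7] == 99
assert cas(mem, 7, expected=1, new=5) == 99 and mem[7] == 99  # failed swap
```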

ALLOC / FREE are placeholders for heap management, deferred to post-v0. They
could be implemented as firmware (a small state machine in the SM that
manages a free list) or as software (the CM program manages allocation
using READ/WRITE/CAS).
9292-9393-## Hardware Architecture
9494-9595-```
9696-Input Interface Output Interface
9797- (receive type-10 request) (send result as type 00/01)
9898- | ^
9999- v |
100100- [Request FIFO] [Result FIFO]
101101- | ^
102102- v |
103103- [Op Decoder]----+ [Result Formatter]
104104- | | ^
105105- v v |
106106- [Addr Decode] [ALU for inc/dec/cas] [Bank Read Data]
107107- | | ^
108108- v v |
109109- [SRAM Bank 0] [SRAM Bank 1] ... [SRAM Bank N]
110110-```
### Banking

- Start with 2 banks (1 address bit selects bank) for v0
- 9-bit address = 512 cells per SM = 1KB at 16-bit data width
- Each bank is one SRAM chip with room to spare
- Banking allows pipelining: one bank can be reading while another is
  being written (for RMW ops, or overlapping independent requests)
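The bank-select split can be sketched as follows. Using the low address bit is an assumption: it interleaves consecutive addresses across banks, so sequential accesses alternate banks and can overlap in the pipeline; selecting on a high bit would block-partition the address space instead.

```python
def bank_select(addr: int, n_bank_bits: int = 1) -> tuple[int, int]:
    """Split a 9-bit cell address into (bank, offset within bank).

    Low-bit interleaving assumed; v0 has 2 banks (n_bank_bits=1).
    """
    bank = addr & ((1 << n_bank_bits) - 1)   # low bit(s) pick the bank
    offset = addr >> n_bank_bits             # rest index within the bank
    return bank, offset
```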
### Internal Components

**Request FIFO**: buffers incoming type-10 packets. Depth TBD (4-8 deep is
probably sufficient for v0). Handles bursty traffic from multiple CMs.

**Op decoder**: extracts the opcode and determines:
- Read, write, or RMW?
- One-flit or two-flit? (CAS is two-flit)
- Does it need a DN response?
- How to pack the result?

**Address decode**: selects the SRAM bank from the address bits.
**ALU**: minimal — increment, decrement, compare. NOT a full ALU, just
enough for the atomic operations. Hardware cost: 16-bit incrementer +
16-bit comparator + mux, roughly 10-15 TTL chips.

**Result formatter**: extracts the return routing from the original request
(ret_CM, ret_offset, ret_ctx, ret_port), combines it with the read data,
and constructs a type 00/01 token. This is where the SM-to-DN format
conversion happens.
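The formatter's job is simple enough to state in a few lines. In this sketch the field names follow the request's return-routing fields; the `ResultToken` class, its bit widths, and the monadic (type 01) choice are assumptions pending the open question below.

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class ResultToken:
    """A DN-format (type 00/01) result.  Packing into wire bits is
    left abstract; the encoding is specified elsewhere in the doc."""
    token_type: int  # 0b01 monadic assumed (see open questions)
    dest_cm: int
    dest_offset: int
    dest_ctx: int
    dest_port: int
    data: int

def format_result(req: dict, read_data: int) -> ResultToken:
    """What the result formatter does: copy the return routing from
    the original request and attach the bank read data."""
    return ResultToken(
        token_type=0b01,
        dest_cm=req["ret_CM"],
        dest_offset=req["ret_offset"],
        dest_ctx=req["ret_ctx"],
        dest_port=req["ret_port"],
        data=read_data,
    )
```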
## Address Space Extension

The 9-bit address in the compact structure token (type 10) gives only
512 cells per SM. Three mechanisms extend it:
### 1. Page Register (recommended for v0)

- SM has a writable config register: "page base" (8-16 bits)
- The 9-bit token address is treated as an offset, added to the page base
- Gives up to 64K+ addressable cells per SM
- CM sets the page with a WRITE to a reserved config address before
  issuing a burst of reads/writes to a region
- Hardware cost: ~3 chips (latch for page register + adder)
- Programming model: familiar bank-switching, like 8-bit micros
- Tradeoff: page switch costs one extra token; compiler batches accesses
  to same page to amortise
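The translation itself is one adder. A plain add (cell-granular page base) is assumed in this sketch; a shifted, page-aligned base register would work equally well with a slightly different adder hookup.

```python
OFFSET_BITS = 9  # width of the compact token's address field

def effective_address(page_base: int, token_addr: int) -> int:
    """Page-register translation: the 9-bit token address is an
    offset added to the page base (plain-add scheme assumed)."""
    assert 0 <= token_addr < (1 << OFFSET_BITS)
    return page_base + token_addr
```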
### 2. Banking as Implicit Address Bits

- SM_id field (2 bits) gives 4 SMs = 4 x 512 = 2K cells system-wide
- Not contiguous from a programming perspective, but the compiler can
  distribute data structures across SMs for both capacity and parallelism
- Essentially free — already in the token format
- Combine with page registers for 4 x 64K = 256K cells system-wide
### 3. Extended Structure Tokens (via type 11)

- Use type-11 (system) packets with a structure-extended subtype for
  structure ops needing wide addresses
- Full 16-24 bit address space, at the cost of 2-cycle token transmission
- Use for: large heap, external RAM chip
- Compact type-10 tokens remain the fast path for common/local accesses
### Practical Address Space with All Three Combined

- Fast path (type 10 + page register): 64K per SM, single-flit
- Medium path (type 10 across SMs): 4 x 64K = 256K, single-flit
- Slow path (type 11 extended): up to 16M+ with wide addresses, two-flit
## V0 Test Plan

- Drive input with microcontroller (RP2040 / Arduino)
- Microcontroller formats 32-bit request packets, clocks into request FIFO
- Read 32-bit result packets from output FIFO
- Test suite:
  - Sequential read/write
  - Random access
  - READ_INC sequences (verify atomicity, verify returned old value)
  - READ_DEC to zero (verify underflow behaviour)
  - CAS success and failure cases
  - Bank contention (same bank back-to-back)
  - Page register set + offset access
  - Boundary conditions (address 0, address 511, page wraparound)
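A couple of the suite's cases, sketched as host-side tests against a toy behavioural model. The real emulator lives in `emu/`; this standalone `SMModel` and its 0xFFFF underflow wrap are assumptions, stated here so the hardware behaviour has something concrete to be checked against.

```python
class SMModel:
    """Minimal behavioural model of one SM, mirroring what the
    microcontroller driver would exercise on the real hardware."""
    def __init__(self, cells: int = 512):
        self.mem = [0] * cells

    def read(self, addr: int) -> int:
        return self.mem[addr]

    def write(self, addr: int, data: int) -> None:
        self.mem[addr] = data & 0xFFFF

    def read_dec(self, addr: int) -> int:
        old = self.mem[addr]
        self.mem[addr] = (old - 1) & 0xFFFF  # assumed wrap at underflow
        return old

def test_read_dec_to_zero():
    sm = SMModel()
    sm.write(3, 1)
    assert sm.read_dec(3) == 1      # returns the old value
    assert sm.read_dec(3) == 0      # hits zero
    assert sm.read(3) == 0xFFFF     # underflow wraps (the choice to verify)

def test_boundary_addresses():
    sm = SMModel()
    sm.write(0, 7)
    sm.write(511, 8)
    assert (sm.read(0), sm.read(511)) == (7, 8)
```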
## Open Design Questions

1. **Result token type** — always monadic (type 01)? Or sometimes dyadic?
   See the "open question" in the interface protocol section above.
2. **CAS two-flit handling** — how does the request FIFO handle two-flit
   ops? Does it buffer both flits before dispatching, or pipeline them?
3. **Page register per-CM or global?** — if multiple CMs access the same
   SM, do they share a page register (contention) or each have their own
   (more hardware, more config)? Probably global for v0.
4. **Banking vs pipeline depth** — with 2 banks, can we overlap a read to
   bank 0 with a write to bank 1? Is that worth the control complexity
   for v0?
5. **SRAM chip selection** — specific part numbers, speed grades, package.
   Needs to match the target clock frequency.