added zmath · altagos.dev/rayray@323a478

+8

build.zig

··· 9 9 .target = target, 10 10 .optimize = optimize, 11 11 }); 12 + add_deps(b, rayray); 12 13 13 14 const exe = b.addExecutable(.{ 14 15 .name = "rayray", ··· 40 41 const test_step = b.step("test", "Run unit tests"); 41 42 test_step.dependOn(&run_lib_unit_tests.step); 42 43 } 44 + 45 + fn add_deps(b: *std.Build, module: *std.Build.Module) void { 46 + const zmath_pkg = b.dependency("zmath", .{ 47 + .enable_cross_platform_determinism = true, 48 + }); 49 + module.addImport("zmath", zmath_pkg.module("zmath")); 50 + }

+1

build.zig.zon

··· 4 4 .minimum_zig_version = "0.12.0-dev.2631+3069669bc", 5 5 .dependencies = .{ 6 6 // See `zig fetch --save <url>` for a command-line interface for adding dependencies. 7 + .zmath = .{ .path = "libs/zmath" }, 7 8 }, 8 9 .paths = .{ 9 10 "",

+141

libs/zmath/README.md

··· 1 + # zmath v0.9.6 - SIMD math library for game developers 2 + 3 + Tested on x86_64 and AArch64. 4 + 5 + Provides ~140 optimized routines and ~70 extensive tests. 6 + 7 + Can be used with any graphics API. 8 + 9 + Documentation can be found [here](https://github.com/michal-z/zig-gamedev/blob/main/libs/zmath/src/zmath.zig). 10 + 11 + Benchamrks can be found [here](https://github.com/michal-z/zig-gamedev/blob/main/libs/zmath/src/benchmark.zig). 12 + 13 + An intro article can be found [here](https://zig.news/michalz/fast-multi-platform-simd-math-library-in-zig-2adn). 14 + 15 + ## Getting started 16 + 17 + Copy `zmath` folder to a `libs` subdirectory of the root of your project and add the following to your `build.zig.zon` .dependencies: 18 + ```zig 19 + .zmath = .{ .path = "libs/zmath" }, 20 + ``` 21 + 22 + Then in your `build.zig` add: 23 + 24 + ```zig 25 + const std = @import("std"); 26 + const zmath = @import("zmath"); 27 + 28 + pub fn build(b: *std.Build) void { 29 + ... 30 + const optimize = b.standardOptimizeOption(.{}); 31 + const target = b.standardTargetOptions(.{}); 32 + 33 + const zmath_pkg = zmath.package(b, target, optimize, .{ 34 + .options = .{ .enable_cross_platform_determinism = true }, 35 + }); 36 + 37 + zmath_pkg.link(exe); 38 + } 39 + ``` 40 + 41 + Now in your code you may import and use zmath: 42 + 43 + ```zig 44 + const zm = @import("zmath"); 45 + 46 + pub fn main() !void { 47 + // 48 + // OpenGL/Vulkan example 49 + // 50 + const object_to_world = zm.rotationY(..); 51 + const world_to_view = zm.lookAtRh( 52 + zm.f32x4(3.0, 3.0, 3.0, 1.0), // eye position 53 + zm.f32x4(0.0, 0.0, 0.0, 1.0), // focus point 54 + zm.f32x4(0.0, 1.0, 0.0, 0.0), // up direction ('w' coord is zero because this is a vector not a point) 55 + ); 56 + // `perspectiveFovRhGl` produces Z values in [-1.0, 1.0] range (Vulkan app should use `perspectiveFovRh`) 57 + const view_to_clip = zm.perspectiveFovRhGl(0.25 * math.pi, aspect_ratio, 0.1, 20.0); 58 + 59 + const object_to_view = zm.mul(object_to_world, world_to_view); 60 + const object_to_clip = zm.mul(object_to_view, view_to_clip); 61 + 62 + // Transposition is needed because GLSL uses column-major matrices by default 63 + gl.uniformMatrix4fv(0, 1, gl.TRUE, zm.arrNPtr(&object_to_clip)); 64 + 65 + // In GLSL: gl_Position = vec4(in_position, 1.0) * object_to_clip; 66 + 67 + // 68 + // DirectX example 69 + // 70 + const object_to_world = zm.rotationY(..); 71 + const world_to_view = zm.lookAtLh( 72 + zm.f32x4(3.0, 3.0, -3.0, 1.0), // eye position 73 + zm.f32x4(0.0, 0.0, 0.0, 1.0), // focus point 74 + zm.f32x4(0.0, 1.0, 0.0, 0.0), // up direction ('w' coord is zero because this is a vector not a point) 75 + ); 76 + const view_to_clip = zm.perspectiveFovLh(0.25 * math.pi, aspect_ratio, 0.1, 20.0); 77 + 78 + const object_to_view = zm.mul(object_to_world, world_to_view); 79 + const object_to_clip = zm.mul(object_to_view, view_to_clip); 80 + 81 + // Transposition is needed because HLSL uses column-major matrices by default 82 + const mem = allocateUploadMemory(...); 83 + zm.storeMat(mem, zm.transpose(object_to_clip)); 84 + 85 + // In HLSL: out_position_sv = mul(float4(in_position, 1.0), object_to_clip); 86 + 87 + // 88 + // 'WASD' camera movement example 89 + // 90 + { 91 + const speed = zm.f32x4s(10.0); 92 + const delta_time = zm.f32x4s(demo.frame_stats.delta_time); 93 + const transform = zm.mul(zm.rotationX(demo.camera.pitch), zm.rotationY(demo.camera.yaw)); 94 + var forward = zm.normalize3(zm.mul(zm.f32x4(0.0, 0.0, 1.0, 0.0), transform)); 95 + 96 + zm.storeArr3(&demo.camera.forward, forward); 97 + 98 + const right = speed * delta_time * zm.normalize3(zm.cross3(zm.f32x4(0.0, 1.0, 0.0, 0.0), forward)); 99 + forward = speed * delta_time * forward; 100 + 101 + var cam_pos = zm.loadArr3(demo.camera.position); 102 + 103 + if (keyDown('W')) { 104 + cam_pos += forward; 105 + } else if (keyDown('S')) { 106 + cam_pos -= forward; 107 + } 108 + if (keyDown('D')) { 109 + cam_pos += right; 110 + } else if (keyDown('A')) { 111 + cam_pos -= right; 112 + } 113 + 114 + zm.storeArr3(&demo.camera.position, cam_pos); 115 + } 116 + 117 + // 118 + // SIMD wave equation solver example (works with vector width 4, 8 and 16) 119 + // 'T' can be F32x4, F32x8 or F32x16 120 + // 121 + var z_index: i32 = 0; 122 + while (z_index < grid_size) : (z_index += 1) { 123 + const z = scale * @intToFloat(f32, z_index - grid_size / 2); 124 + const vz = zm.splat(T, z); 125 + 126 + var x_index: i32 = 0; 127 + while (x_index < grid_size) : (x_index += zm.veclen(T)) { 128 + const x = scale * @intToFloat(f32, x_index - grid_size / 2); 129 + const vx = zm.splat(T, x) + voffset * zm.splat(T, scale); 130 + 131 + const d = zm.sqrt(vx * vx + vz * vz); 132 + const vy = zm.sin(d - vtime); 133 + 134 + const index = @intCast(usize, x_index + z_index * grid_size); 135 + zm.store(xslice[index..], vx, 0); 136 + zm.store(yslice[index..], vy, 0); 137 + zm.store(zslice[index..], vz, 0); 138 + } 139 + } 140 + } 141 + ```

+103

libs/zmath/build.zig

··· 1 + const std = @import("std"); 2 + 3 + pub const Options = struct { 4 + enable_cross_platform_determinism: bool = true, 5 + }; 6 + 7 + pub const Package = struct { 8 + target: std.Build.ResolvedTarget, 9 + options: Options, 10 + zmath: *std.Build.Module, 11 + zmath_options: *std.Build.Module, 12 + 13 + pub fn link(pkg: Package, exe: *std.Build.Step.Compile) void { 14 + exe.root_module.addImport("zmath", pkg.zmath); 15 + exe.root_module.addImport("zmath_options", pkg.zmath_options); 16 + } 17 + }; 18 + 19 + pub fn package( 20 + b: *std.Build, 21 + target: std.Build.ResolvedTarget, 22 + _: std.builtin.Mode, 23 + args: struct { 24 + options: Options = .{}, 25 + }, 26 + ) Package { 27 + const step = b.addOptions(); 28 + step.addOption( 29 + bool, 30 + "enable_cross_platform_determinism", 31 + args.options.enable_cross_platform_determinism, 32 + ); 33 + 34 + const zmath_options = step.createModule(); 35 + 36 + const zmath = b.addModule("zmath", .{ 37 + .root_source_file = .{ .path = thisDir() ++ "/src/main.zig" }, 38 + .imports = &.{ 39 + .{ .name = "zmath_options", .module = zmath_options }, 40 + }, 41 + }); 42 + 43 + return .{ 44 + .target = target, 45 + .options = args.options, 46 + .zmath = zmath, 47 + .zmath_options = zmath_options, 48 + }; 49 + } 50 + 51 + pub fn build(b: *std.Build) void { 52 + const optimize = b.standardOptimizeOption(.{}); 53 + const target = b.standardTargetOptions(.{}); 54 + 55 + _ = package(b, target, optimize, .{ .options = .{ 56 + .enable_cross_platform_determinism = b.option(bool, "enable_cross_platform_determinism", "Whether to enable cross-platform determinism.") orelse true, 57 + } }); 58 + 59 + const test_step = b.step("test", "Run zmath tests"); 60 + test_step.dependOn(runTests(b, optimize, target)); 61 + 62 + const benchmark_step = b.step("benchmark", "Run zmath benchmarks"); 63 + benchmark_step.dependOn(runBenchmarks(b, target)); 64 + } 65 + 66 + pub fn runTests( 67 + b: *std.Build, 68 + optimize: std.builtin.Mode, 69 + target: std.Build.ResolvedTarget, 70 + ) *std.Build.Step { 71 + const tests = b.addTest(.{ 72 + .name = "zmath-tests", 73 + .root_source_file = .{ .path = thisDir() ++ "/src/main.zig" }, 74 + .target = target, 75 + .optimize = optimize, 76 + }); 77 + 78 + const zmath_pkg = package(b, target, optimize, .{}); 79 + tests.root_module.addImport("zmath_options", zmath_pkg.zmath_options); 80 + 81 + return &b.addRunArtifact(tests).step; 82 + } 83 + 84 + pub fn runBenchmarks( 85 + b: *std.Build, 86 + target: std.Build.ResolvedTarget, 87 + ) *std.Build.Step { 88 + const exe = b.addExecutable(.{ 89 + .name = "zmath-benchmarks", 90 + .root_source_file = .{ .path = thisDir() ++ "/src/benchmark.zig" }, 91 + .target = target, 92 + .optimize = .ReleaseFast, 93 + }); 94 + 95 + const zmath_pkg = package(b, target, .ReleaseFast, .{}); 96 + exe.root_module.addImport("zmath", zmath_pkg.zmath); 97 + 98 + return &b.addRunArtifact(exe).step; 99 + } 100 + 101 + inline fn thisDir() []const u8 { 102 + return comptime std.fs.path.dirname(@src().file) orelse "."; 103 + }

+10

libs/zmath/build.zig.zon

··· 1 + .{ 2 + .name = "zmath", 3 + .version = "0.9.6", 4 + .paths = .{ 5 + "build.zig", 6 + "build.zig.zon", 7 + "src", 8 + "README.md", 9 + }, 10 + }

+469

libs/zmath/src/benchmark.zig

··· 1 + // ------------------------------------------------------------------------------------------------- 2 + // zmath - benchmarks 3 + // ------------------------------------------------------------------------------------------------- 4 + // 'zig build benchmark' in the root project directory will build and run 'ReleaseFast' configuration. 5 + // 6 + // ------------------------------------------------------------------------------------------------- 7 + // 'AMD Ryzen 9 3950X 16-Core Processor', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f 8 + // ------------------------------------------------------------------------------------------------- 9 + // matrix mul benchmark (AOS) - scalar version: 1.5880s, zmath version: 1.0642s 10 + // cross3, scale, bias benchmark (AOS) - scalar version: 0.9318s, zmath version: 0.6888s 11 + // cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.2258s, zmath version: 1.1095s 12 + // quaternion mul benchmark (AOS) - scalar version: 1.4123s, zmath version: 0.6958s 13 + // wave benchmark (SOA) - scalar version: 4.8165s, zmath version: 0.7338s 14 + // 15 + // ------------------------------------------------------------------------------------------------- 16 + // 'AMD Ryzen 7 5800X 8-Core Processer', Linux 5.17.14, Zig 0.10.0-dev.2624+d506275a0 17 + // ------------------------------------------------------------------------------------------------- 18 + // matrix mul benchmark (AOS) - scalar version: 1.3672s, zmath version: 0.8617s 19 + // cross3, scale, bias benchmark (AOS) - scalar version: 0.6586s, zmath version: 0.4803s 20 + // cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.0620s, zmath version: 0.8942s 21 + // quaternion mul benchmark (AOS) - scalar version: 1.1324s, zmath version: 0.6064s 22 + // wave benchmark (SOA) - scalar version: 3.6598s, zmath version: 0.4231s 23 + // 24 + // ------------------------------------------------------------------------------------------------- 25 + // 'Apple M1 Max', macOS Version 12.4, Zig 0.10.0-dev.2657+74442f350 26 + // ------------------------------------------------------------------------------------------------- 27 + // matrix mul benchmark (AOS) - scalar version: 1.0297s, zmath version: 1.0538s 28 + // cross3, scale, bias benchmark (AOS) - scalar version: 0.6294s, zmath version: 0.6532s 29 + // cross3, dot3, scale, bias benchmark (AOS) - scalar version: 0.9807s, zmath version: 1.0988s 30 + // quaternion mul benchmark (AOS) - scalar version: 1.5413s, zmath version: 0.7800s 31 + // wave benchmark (SOA) - scalar version: 3.4220s, zmath version: 1.0255s 32 + // 33 + // ------------------------------------------------------------------------------------------------- 34 + // '11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f 35 + // ------------------------------------------------------------------------------------------------- 36 + // matrix mul benchmark (AOS) - scalar version: 2.2308s, zmath version: 0.9376s 37 + // cross3, scale, bias benchmark (AOS) - scalar version: 1.0821s, zmath version: 0.5110s 38 + // cross3, dot3, scale, bias benchmark (AOS) - scalar version: 1.6580s, zmath version: 0.9167s 39 + // quaternion mul benchmark (AOS) - scalar version: 2.0139s, zmath version: 0.5856s 40 + // wave benchmark (SOA) - scalar version: 3.7832s, zmath version: 0.3642s 41 + // 42 + // ------------------------------------------------------------------------------------------------- 43 + 44 + pub fn main() !void { 45 + var gpa = std.heap.GeneralPurposeAllocator(.{}){}; 46 + defer _ = gpa.deinit(); 47 + const allocator = gpa.allocator(); 48 + 49 + // m = mul(ma, mb); data set fits in L1 cache; AOS data layout. 50 + try mat4MulBenchmark(allocator, 100_000); 51 + 52 + // v = 0.01 * cross3(va, vb) + vec3(1.0); data set fits in L1 cache; AOS data layout. 53 + try cross3ScaleBiasBenchmark(allocator, 10_000); 54 + 55 + // v = dot3(va, vb) * (0.1 * cross3(va, vb) + vec3(1.0)); data set fits in L1 cache; AOS data layout. 56 + try cross3Dot3ScaleBiasBenchmark(allocator, 10_000); 57 + 58 + // q = qmul(qa, qb); data set fits in L1 cache; AOS data layout. 59 + try quatBenchmark(allocator, 10_000); 60 + 61 + // d = sqrt(x * x + z * z); y = sin(d - t); SOA layout. 62 + try waveBenchmark(allocator, 1_000); 63 + } 64 + 65 + const std = @import("std"); 66 + const time = std.time; 67 + const Timer = time.Timer; 68 + const zm = @import("zmath"); 69 + 70 + var prng = std.rand.DefaultPrng.init(0); 71 + const random = prng.random(); 72 + 73 + noinline fn mat4MulBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { 74 + std.debug.print("\n", .{}); 75 + std.debug.print("{s:>42} - ", .{"matrix mul benchmark (AOS)"}); 76 + 77 + var data0 = std.ArrayList([16]f32).init(allocator); 78 + defer data0.deinit(); 79 + var data1 = std.ArrayList([16]f32).init(allocator); 80 + defer data1.deinit(); 81 + 82 + var i: usize = 0; 83 + while (i < 64) : (i += 1) { 84 + try data0.append([16]f32{ 85 + random.float(f32), random.float(f32), random.float(f32), random.float(f32), 86 + random.float(f32), random.float(f32), random.float(f32), random.float(f32), 87 + random.float(f32), random.float(f32), random.float(f32), random.float(f32), 88 + random.float(f32), random.float(f32), random.float(f32), random.float(f32), 89 + }); 90 + try data1.append([16]f32{ 91 + random.float(f32), random.float(f32), random.float(f32), random.float(f32), 92 + random.float(f32), random.float(f32), random.float(f32), random.float(f32), 93 + random.float(f32), random.float(f32), random.float(f32), random.float(f32), 94 + random.float(f32), random.float(f32), random.float(f32), random.float(f32), 95 + }); 96 + } 97 + 98 + // Warmup, fills L1 cache. 99 + i = 0; 100 + while (i < 100) : (i += 1) { 101 + for (data1.items) |b| { 102 + for (data0.items) |a| { 103 + const ma = zm.loadMat(a[0..]); 104 + const mb = zm.loadMat(b[0..]); 105 + const r = zm.mul(ma, mb); 106 + std.mem.doNotOptimizeAway(&r); 107 + } 108 + } 109 + } 110 + 111 + { 112 + i = 0; 113 + var timer = try Timer.start(); 114 + const start = timer.lap(); 115 + while (i < count) : (i += 1) { 116 + for (data1.items) |b| { 117 + for (data0.items) |a| { 118 + const r = [16]f32{ 119 + a[0] * b[0] + a[1] * b[4] + a[2] * b[8] + a[3] * b[12], 120 + a[0] * b[1] + a[1] * b[5] + a[2] * b[9] + a[3] * b[13], 121 + a[0] * b[2] + a[1] * b[6] + a[2] * b[10] + a[3] * b[14], 122 + a[0] * b[3] + a[1] * b[7] + a[2] * b[11] + a[3] * b[15], 123 + a[4] * b[0] + a[5] * b[4] + a[6] * b[8] + a[7] * b[12], 124 + a[4] * b[1] + a[5] * b[5] + a[6] * b[9] + a[7] * b[13], 125 + a[4] * b[2] + a[5] * b[6] + a[6] * b[10] + a[7] * b[14], 126 + a[4] * b[3] + a[5] * b[7] + a[6] * b[11] + a[7] * b[15], 127 + a[8] * b[0] + a[9] * b[4] + a[10] * b[8] + a[11] * b[12], 128 + a[8] * b[1] + a[9] * b[5] + a[10] * b[9] + a[11] * b[13], 129 + a[8] * b[2] + a[9] * b[6] + a[10] * b[10] + a[11] * b[14], 130 + a[8] * b[3] + a[9] * b[7] + a[10] * b[11] + a[11] * b[15], 131 + a[12] * b[0] + a[13] * b[4] + a[14] * b[8] + a[15] * b[12], 132 + a[12] * b[1] + a[13] * b[5] + a[14] * b[9] + a[15] * b[13], 133 + a[12] * b[2] + a[13] * b[6] + a[14] * b[10] + a[15] * b[14], 134 + a[12] * b[3] + a[13] * b[7] + a[14] * b[11] + a[15] * b[15], 135 + }; 136 + std.mem.doNotOptimizeAway(&r); 137 + } 138 + } 139 + } 140 + const end = timer.read(); 141 + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; 142 + 143 + std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); 144 + } 145 + 146 + { 147 + i = 0; 148 + var timer = try Timer.start(); 149 + const start = timer.lap(); 150 + while (i < count) : (i += 1) { 151 + for (data1.items) |b| { 152 + for (data0.items) |a| { 153 + const ma = zm.loadMat(a[0..]); 154 + const mb = zm.loadMat(b[0..]); 155 + const r = zm.mul(ma, mb); 156 + std.mem.doNotOptimizeAway(&r); 157 + } 158 + } 159 + } 160 + const end = timer.read(); 161 + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; 162 + 163 + std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); 164 + } 165 + } 166 + 167 + noinline fn cross3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { 168 + std.debug.print("{s:>42} - ", .{"cross3, scale, bias benchmark (AOS)"}); 169 + 170 + var data0 = std.ArrayList([3]f32).init(allocator); 171 + defer data0.deinit(); 172 + var data1 = std.ArrayList([3]f32).init(allocator); 173 + defer data1.deinit(); 174 + 175 + var i: usize = 0; 176 + while (i < 256) : (i += 1) { 177 + try data0.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); 178 + try data1.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); 179 + } 180 + 181 + // Warmup, fills L1 cache. 182 + i = 0; 183 + while (i < 100) : (i += 1) { 184 + for (data1.items) |b| { 185 + for (data0.items) |a| { 186 + const va = zm.loadArr3(a); 187 + const vb = zm.loadArr3(b); 188 + const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0); 189 + std.mem.doNotOptimizeAway(&cp); 190 + } 191 + } 192 + } 193 + 194 + { 195 + i = 0; 196 + var timer = try Timer.start(); 197 + const start = timer.lap(); 198 + while (i < count) : (i += 1) { 199 + for (data1.items) |b| { 200 + for (data0.items) |a| { 201 + const r = [3]f32{ 202 + 0.01 * (a[1] * b[2] - a[2] * b[1]) + 1.0, 203 + 0.01 * (a[2] * b[0] - a[0] * b[2]) + 1.0, 204 + 0.01 * (a[0] * b[1] - a[1] * b[0]) + 1.0, 205 + }; 206 + std.mem.doNotOptimizeAway(&r); 207 + } 208 + } 209 + } 210 + const end = timer.read(); 211 + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; 212 + 213 + std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); 214 + } 215 + 216 + { 217 + i = 0; 218 + var timer = try Timer.start(); 219 + const start = timer.lap(); 220 + while (i < count) : (i += 1) { 221 + for (data1.items) |b| { 222 + for (data0.items) |a| { 223 + const va = zm.loadArr3(a); 224 + const vb = zm.loadArr3(b); 225 + const cp = zm.f32x4s(0.01) * zm.cross3(va, vb) + zm.f32x4s(1.0); 226 + std.mem.doNotOptimizeAway(&cp); 227 + } 228 + } 229 + } 230 + const end = timer.read(); 231 + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; 232 + 233 + std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); 234 + } 235 + } 236 + 237 + noinline fn cross3Dot3ScaleBiasBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { 238 + std.debug.print("{s:>42} - ", .{"cross3, dot3, scale, bias benchmark (AOS)"}); 239 + 240 + var data0 = std.ArrayList([3]f32).init(allocator); 241 + defer data0.deinit(); 242 + var data1 = std.ArrayList([3]f32).init(allocator); 243 + defer data1.deinit(); 244 + 245 + var i: usize = 0; 246 + while (i < 256) : (i += 1) { 247 + try data0.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); 248 + try data1.append([3]f32{ random.float(f32), random.float(f32), random.float(f32) }); 249 + } 250 + 251 + // Warmup, fills L1 cache. 252 + i = 0; 253 + while (i < 100) : (i += 1) { 254 + for (data1.items) |b| { 255 + for (data0.items) |a| { 256 + const va = zm.loadArr3(a); 257 + const vb = zm.loadArr3(b); 258 + const r = (zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0)))[0]; 259 + std.mem.doNotOptimizeAway(&r); 260 + } 261 + } 262 + } 263 + 264 + { 265 + i = 0; 266 + var timer = try Timer.start(); 267 + const start = timer.lap(); 268 + while (i < count) : (i += 1) { 269 + for (data1.items) |b| { 270 + for (data0.items) |a| { 271 + const d = a[0] * b[0] + a[1] * b[1] + a[2] * b[2]; 272 + const r = [3]f32{ 273 + d * (0.1 * (a[1] * b[2] - a[2] * b[1]) + 1.0), 274 + d * (0.1 * (a[2] * b[0] - a[0] * b[2]) + 1.0), 275 + d * (0.1 * (a[0] * b[1] - a[1] * b[0]) + 1.0), 276 + }; 277 + std.mem.doNotOptimizeAway(&r); 278 + } 279 + } 280 + } 281 + const end = timer.read(); 282 + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; 283 + 284 + std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); 285 + } 286 + 287 + { 288 + i = 0; 289 + var timer = try Timer.start(); 290 + const start = timer.lap(); 291 + while (i < count) : (i += 1) { 292 + for (data1.items) |b| { 293 + for (data0.items) |a| { 294 + const va = zm.loadArr3(a); 295 + const vb = zm.loadArr3(b); 296 + const r = zm.dot3(va, vb) * (zm.f32x4s(0.1) * zm.cross3(va, vb) + zm.f32x4s(1.0)); 297 + std.mem.doNotOptimizeAway(&r); 298 + } 299 + } 300 + } 301 + const end = timer.read(); 302 + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; 303 + 304 + std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); 305 + } 306 + } 307 + 308 + noinline fn quatBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { 309 + std.debug.print("{s:>42} - ", .{"quaternion mul benchmark (AOS)"}); 310 + 311 + var data0 = std.ArrayList([4]f32).init(allocator); 312 + defer data0.deinit(); 313 + var data1 = std.ArrayList([4]f32).init(allocator); 314 + defer data1.deinit(); 315 + 316 + var i: usize = 0; 317 + while (i < 256) : (i += 1) { 318 + try data0.append([4]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32) }); 319 + try data1.append([4]f32{ random.float(f32), random.float(f32), random.float(f32), random.float(f32) }); 320 + } 321 + 322 + // Warmup, fills L1 cache. 323 + i = 0; 324 + while (i < 100) : (i += 1) { 325 + for (data1.items) |b| { 326 + for (data0.items) |a| { 327 + const va = zm.loadArr4(a); 328 + const vb = zm.loadArr4(b); 329 + const r = zm.qmul(va, vb); 330 + std.mem.doNotOptimizeAway(&r); 331 + } 332 + } 333 + } 334 + 335 + { 336 + i = 0; 337 + var timer = try Timer.start(); 338 + const start = timer.lap(); 339 + while (i < count) : (i += 1) { 340 + for (data1.items) |b| { 341 + for (data0.items) |a| { 342 + const r = [4]f32{ 343 + (b[3] * a[0]) + (b[0] * a[3]) + (b[1] * a[2]) - (b[2] * a[1]), 344 + (b[3] * a[1]) - (b[0] * a[2]) + (b[1] * a[3]) + (b[2] * a[0]), 345 + (b[3] * a[2]) + (b[0] * a[1]) - (b[1] * a[0]) + (b[2] * a[3]), 346 + (b[3] * a[3]) - (b[0] * a[0]) - (b[1] * a[1]) - (b[2] * a[2]), 347 + }; 348 + std.mem.doNotOptimizeAway(&r); 349 + } 350 + } 351 + } 352 + const end = timer.read(); 353 + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; 354 + 355 + std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); 356 + } 357 + 358 + { 359 + i = 0; 360 + var timer = try Timer.start(); 361 + const start = timer.lap(); 362 + while (i < count) : (i += 1) { 363 + for (data1.items) |b| { 364 + for (data0.items) |a| { 365 + const va = zm.loadArr4(a); 366 + const vb = zm.loadArr4(b); 367 + const r = zm.qmul(va, vb); 368 + std.mem.doNotOptimizeAway(&r); 369 + } 370 + } 371 + } 372 + const end = timer.read(); 373 + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; 374 + 375 + std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); 376 + } 377 + } 378 + 379 + noinline fn waveBenchmark(allocator: std.mem.Allocator, comptime count: comptime_int) !void { 380 + _ = allocator; 381 + std.debug.print("{s:>42} - ", .{"wave benchmark (SOA)"}); 382 + 383 + const grid_size = 1024; 384 + { 385 + var t: f32 = 0.0; 386 + 387 + const scale: f32 = 0.05; 388 + 389 + var timer = try Timer.start(); 390 + const start = timer.lap(); 391 + 392 + var iter: usize = 0; 393 + while (iter < count) : (iter += 1) { 394 + var z_index: i32 = 0; 395 + while (z_index < grid_size) : (z_index += 1) { 396 + const z = scale * @as(f32, @floatFromInt(z_index - grid_size / 2)); 397 + 398 + var x_index: i32 = 0; 399 + while (x_index < grid_size) : (x_index += 4) { 400 + const x0 = scale * @as(f32, @floatFromInt(x_index + 0 - grid_size / 2)); 401 + const x1 = scale * @as(f32, @floatFromInt(x_index + 1 - grid_size / 2)); 402 + const x2 = scale * @as(f32, @floatFromInt(x_index + 2 - grid_size / 2)); 403 + const x3 = scale * @as(f32, @floatFromInt(x_index + 3 - grid_size / 2)); 404 + 405 + const d0 = zm.sqrt(x0 * x0 + z * z); 406 + const d1 = zm.sqrt(x1 * x1 + z * z); 407 + const d2 = zm.sqrt(x2 * x2 + z * z); 408 + const d3 = zm.sqrt(x3 * x3 + z * z); 409 + 410 + const y0 = zm.sin(d0 - t); 411 + const y1 = zm.sin(d1 - t); 412 + const y2 = zm.sin(d2 - t); 413 + const y3 = zm.sin(d3 - t); 414 + 415 + std.mem.doNotOptimizeAway(&y0); 416 + std.mem.doNotOptimizeAway(&y1); 417 + std.mem.doNotOptimizeAway(&y2); 418 + std.mem.doNotOptimizeAway(&y3); 419 + } 420 + } 421 + t += 0.001; 422 + } 423 + const end = timer.read(); 424 + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; 425 + 426 + std.debug.print("scalar version: {d:.4}s, ", .{elapsed_s}); 427 + } 428 + 429 + { 430 + const T = zm.F32x16; 431 + 432 + const static = struct { 433 + const offsets = [16]f32{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; 434 + }; 435 + const voffset = zm.load(static.offsets[0..], T, 0); 436 + var vt = zm.splat(T, 0.0); 437 + 438 + const scale: f32 = 0.05; 439 + 440 + var timer = try Timer.start(); 441 + const start = timer.lap(); 442 + 443 + var iter: usize = 0; 444 + while (iter < count) : (iter += 1) { 445 + var z_index: i32 = 0; 446 + while (z_index < grid_size) : (z_index += 1) { 447 + const z = scale * @as(f32, @floatFromInt(z_index - grid_size / 2)); 448 + const vz = zm.splat(T, z); 449 + 450 + var x_index: i32 = 0; 451 + while (x_index < grid_size) : (x_index += zm.veclen(T)) { 452 + const x = scale * @as(f32, @floatFromInt(x_index - grid_size / 2)); 453 + const vx = zm.splat(T, x) + voffset * zm.splat(T, scale); 454 + 455 + const d = zm.sqrt(vx * vx + vz * vz); 456 + 457 + const vy = zm.sin(d - vt); 458 + 459 + std.mem.doNotOptimizeAway(&vy); 460 + } 461 + } 462 + vt += zm.splat(T, 0.001); 463 + } 464 + const end = timer.read(); 465 + const elapsed_s = @as(f64, @floatFromInt(end - start)) / time.ns_per_s; 466 + 467 + std.debug.print("zmath version: {d:.4}s\n", .{elapsed_s}); 468 + } 469 + }

+16

libs/zmath/src/main.zig

··· 1 + //-------------------------------------------------------------------------------------------------- 2 + // 3 + // SIMD math library for game developers 4 + // https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath 5 + // 6 + // See zmath.zig for more details. 7 + // See util.zig for additional functionality. 8 + // 9 + //-------------------------------------------------------------------------------------------------- 10 + pub usingnamespace @import("zmath.zig"); 11 + pub const util = @import("util.zig"); 12 + 13 + // ensure transitive closure of test coverage 14 + comptime { 15 + _ = util; 16 + }

+182

libs/zmath/src/util.zig

··· 1 + // ============================================================================== 2 + // 3 + // Collection of useful functions building on top of, and extending, core zmath. 4 + // https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath 5 + // 6 + // ------------------------------------------------------------------------------ 7 + // 1. Matrix functions 8 + // ------------------------------------------------------------------------------ 9 + // 10 + // As an example, in a left handed Y-up system: 11 + // getAxisX is equivalent to the right vector 12 + // getAxisY is equivalent to the up vector 13 + // getAxisZ is equivalent to the forward vector 14 + // 15 + // getTranslationVec(m: Mat) Vec 16 + // getAxisX(m: Mat) Vec 17 + // getAxisY(m: Mat) Vec 18 + // getAxisZ(m: Mat) Vec 19 + // 20 + // ============================================================================== 21 + 22 + const zm = @import("zmath.zig"); 23 + const std = @import("std"); 24 + const math = std.math; 25 + const expect = std.testing.expect; 26 + 27 + pub fn getTranslationVec(m: zm.Mat) zm.Vec { 28 + var translation = m[3]; 29 + translation[3] = 0; 30 + return translation; 31 + } 32 + 33 + pub fn getScaleVec(m: zm.Mat) zm.Vec { 34 + const scale_x = zm.length3(zm.f32x4(m[0][0], m[1][0], m[2][0], 0))[0]; 35 + const scale_y = zm.length3(zm.f32x4(m[0][1], m[1][1], m[2][1], 0))[0]; 36 + const scale_z = zm.length3(zm.f32x4(m[0][2], m[1][2], m[2][2], 0))[0]; 37 + return zm.f32x4(scale_x, scale_y, scale_z, 0); 38 + } 39 + 40 + pub fn getRotationQuat(_m: zm.Mat) zm.Quat { 41 + // Ortho normalize given matrix. 42 + const c1 = zm.normalize3(zm.f32x4(_m[0][0], _m[1][0], _m[2][0], 0)); 43 + const c2 = zm.normalize3(zm.f32x4(_m[0][1], _m[1][1], _m[2][1], 0)); 44 + const c3 = zm.normalize3(zm.f32x4(_m[0][2], _m[1][2], _m[2][2], 0)); 45 + var m = _m; 46 + m[0][0] = c1[0]; 47 + m[1][0] = c1[1]; 48 + m[2][0] = c1[2]; 49 + m[0][1] = c2[0]; 50 + m[1][1] = c2[1]; 51 + m[2][1] = c2[2]; 52 + m[0][2] = c3[0]; 53 + m[1][2] = c3[1]; 54 + m[2][2] = c3[2]; 55 + 56 + // Extract rotation 57 + return zm.quatFromMat(m); 58 + } 59 + 60 + pub fn getAxisX(m: zm.Mat) zm.Vec { 61 + return zm.normalize3(zm.f32x4(m[0][0], m[0][1], m[0][2], 0.0)); 62 + } 63 + 64 + pub fn getAxisY(m: zm.Mat) zm.Vec { 65 + return zm.normalize3(zm.f32x4(m[1][0], m[1][1], m[1][2], 0.0)); 66 + } 67 + 68 + pub fn getAxisZ(m: zm.Mat) zm.Vec { 69 + return zm.normalize3(zm.f32x4(m[2][0], m[2][1], m[2][2], 0.0)); 70 + } 71 + 72 + test "zmath.util.mat.translation" { 73 + // zig fmt: off 74 + const mat_data = [18]f32{ 75 + 1.0, 76 + 2.0, 3.0, 4.0, 5.0, 77 + 6.0, 7.0, 8.0, 9.0, 78 + 10.0,11.0, 12.0,13.0, 79 + 14.0, 15.0, 16.0, 17.0, 80 + 18.0, 81 + }; 82 + // zig fmt: on 83 + const mat = zm.loadMat(mat_data[1..]); 84 + const translation = getTranslationVec(mat); 85 + try expect(zm.approxEqAbs(translation, zm.f32x4(14.0, 15.0, 16.0, 0.0), 0.0001)); 86 + } 87 + 88 + test "zmath.util.mat.scale" { 89 + const mat = zm.mul(zm.scaling(3, 4, 5), zm.translation(6, 7, 8)); 90 + const scale = getScaleVec(mat); 91 + try expect(zm.approxEqAbs(scale, zm.f32x4(3.0, 4.0, 5.0, 0.0), 0.0001)); 92 + } 93 + 94 + test "zmath.util.mat.rotation" { 95 + const rotate_origin = zm.matFromRollPitchYaw(0.1, 1.2, 2.3); 96 + const mat = zm.mul(zm.mul(rotate_origin, zm.scaling(3, 4, 5)), zm.translation(6, 7, 8)); 97 + const rotate_get = getRotationQuat(mat); 98 + const v0 = zm.mul(zm.f32x4s(1), rotate_origin); 99 + const v1 = zm.mul(zm.f32x4s(1), zm.quatToMat(rotate_get)); 100 + try expect(zm.approxEqAbs(v0, v1, 0.0001)); 101 + } 102 + 103 + test "zmath.util.mat.z_vec" { 104 + const degToRad = std.math.degreesToRadians; 105 + var identity = zm.identity(); 106 + var z_vec = getAxisZ(identity); 107 + try expect(zm.approxEqAbs(z_vec, zm.f32x4(0.0, 0.0, 1.0, 0), 0.0001)); 108 + const rot_yaw = zm.rotationY(degToRad(f32, 90)); 109 + identity = zm.mul(identity, rot_yaw); 110 + z_vec = getAxisZ(identity); 111 + try expect(zm.approxEqAbs(z_vec, zm.f32x4(1.0, 0.0, 0.0, 0), 0.0001)); 112 + } 113 + 114 + test "zmath.util.mat.y_vec" { 115 + const degToRad = std.math.degreesToRadians; 116 + var identity = zm.identity(); 117 + var y_vec = getAxisY(identity); 118 + try expect(zm.approxEqAbs(y_vec, zm.f32x4(0.0, 1.0, 0.0, 0), 0.01)); 119 + const rot_yaw = zm.rotationY(degToRad(f32, 90)); 120 + identity = zm.mul(identity, rot_yaw); 121 + y_vec = getAxisY(identity); 122 + try expect(zm.approxEqAbs(y_vec, zm.f32x4(0.0, 1.0, 0.0, 0), 0.01)); 123 + const rot_pitch = zm.rotationX(degToRad(f32, 90)); 124 + identity = zm.mul(identity, rot_pitch); 125 + y_vec = getAxisY(identity); 126 + try expect(zm.approxEqAbs(y_vec, zm.f32x4(0.0, 0.0, 1.0, 0), 0.01)); 127 + } 128 + 129 + test "zmath.util.mat.right" { 130 + const degToRad = std.math.degreesToRadians; 131 + var identity = zm.identity(); 132 + var right = getAxisX(identity); 133 + try expect(zm.approxEqAbs(right, zm.f32x4(1.0, 0.0, 0.0, 0), 0.01)); 134 + const rot_yaw = zm.rotationY(degToRad(f32, 90)); 135 + identity = zm.mul(identity, rot_yaw); 136 + right = getAxisX(identity); 137 + try expect(zm.approxEqAbs(right, zm.f32x4(0.0, 0.0, -1.0, 0), 0.01)); 138 + const rot_pitch = zm.rotationX(degToRad(f32, 90)); 139 + identity = zm.mul(identity, rot_pitch); 140 + right = getAxisX(identity); 141 + try expect(zm.approxEqAbs(right, zm.f32x4(0.0, 1.0, 0.0, 0), 0.01)); 142 + } 143 + 144 + // ------------------------------------------------------------------------------ 145 + // This software is available under 2 licenses -- choose whichever you prefer. 146 + // ------------------------------------------------------------------------------ 147 + // ALTERNATIVE A - MIT License 148 + // Copyright (c) 2022 Michal Ziulek and Contributors 149 + // Permission is hereby granted, free of charge, to any person obtaining identity copy of 150 + // this software and associated documentation files (the "Software"), to deal in 151 + // the Software without restriction, including without limitation the rights to 152 + // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 153 + // of the Software, and to permit persons to whom the Software is furnished to do 154 + // so, subject to the following conditions: 155 + // The above copyright notice and this permission notice shall be included in all 156 + // copies or substantial portions of the Software. 157 + // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 158 + // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 159 + // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 160 + // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 161 + // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 162 + // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 163 + // SOFTWARE. 164 + // ------------------------------------------------------------------------------ 165 + // ALTERNATIVE B - Public Domain (www.unlicense.org) 166 + // This is free and unencumbered software released into the public domain. 167 + // Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 168 + // software, either in source code form or as identity compiled binary, for any purpose, 169 + // commercial or non-commercial, and by any means. 170 + // In jurisdictions that recognize copyright laws, the author or authors of this 171 + // software dedicate any and all copyright interest in the software to the public 172 + // domain. We make this dedication for the benefit of the public at large and to 173 + // the detriment of our heirs and successors. We intend this dedication to be an 174 + // overt act of relinquishment in perpetuity of all present and future rights to 175 + // this software under copyright law. 176 + // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 177 + // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 178 + // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 179 + // AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 180 + // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 181 + // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 182 + // ------------------------------------------------------------------------------

+4554

libs/zmath/src/zmath.zig

··· 1 + // ============================================================================== 2 + // 3 + // SIMD math library for game developers 4 + // https://github.com/michal-z/zig-gamedev/tree/main/libs/zmath 5 + // 6 + // Should work on all OSes supported by Zig. Works on x86_64 and ARM. 7 + // Provides ~140 optimized routines and ~70 extensive tests. 8 + // Can be used with any graphics API. 9 + // 10 + // zmath uses row-major matrices, row vectors (each row vector is stored in a SIMD register). 11 + // Handedness is determined by which function version is used (Rh vs. Lh), 12 + // otherwise the function works with either left-handed or right-handed view coordinates. 13 + // 14 + // const va = f32x4(1.0, 2.0, 3.0, 1.0); 15 + // const vb = f32x4(-1.0, 1.0, -1.0, 1.0); 16 + // const v0 = va + vb - f32x4(0.0, 1.0, 0.0, 1.0) * f32x4s(3.0); 17 + // const v1 = cross3(va, vb) + f32x4(1.0, 1.0, 1.0, 1.0); 18 + // const v2 = va + dot3(va, vb) / v1; // dotN() returns scalar replicated on all vector components 19 + // 20 + // const m = rotationX(math.pi * 0.25); 21 + // const v = f32x4(...); 22 + // const v0 = mul(v, m); // 'v' treated as a row vector 23 + // const v1 = mul(m, v); // 'v' treated as a column vector 24 + // const f = m[row][column]; 25 + // 26 + // const b = va < vb; 27 + // if (all(b, 0)) { ... } // '0' means check all vector components; if all are 'true' 28 + // if (all(b, 3)) { ... } // '3' means check first three vector components; if all first three are 'true' 29 + // if (any(b, 0)) { ... } // '0' means check all vector components; if any is 'true' 30 + // if (any(b, 3)) { ... } // '3' means check first three vector components; if any from first three is 'true' 31 + // 32 + // var v4 = load(mem[0..], F32x4, 0); 33 + // var v8 = load(mem[100..], F32x8, 0); 34 + // var v16 = load(mem[200..], F32x16, 0); 35 + // 36 + // var camera_position = [3]f32{ 1.0, 2.0, 3.0 }; 37 + // var cam_pos = loadArr3(camera_position); 38 + // ... 39 + // storeArr3(&camera_position, cam_pos); 40 + // 41 + // v4 = sin(v4); // SIMDx4 42 + // v8 = cos(v8); // .x86_64 -> 2 x SIMDx4, .x86_64+avx+fma -> SIMDx8 43 + // v16 = atan(v16); // .x86_64 -> 4 x SIMDx4, .x86_64+avx+fma -> 2 x SIMDx8, .x86_64+avx512f -> SIMDx16 44 + // 45 + // store(mem[0..], v4, 0); 46 + // store(mem[100..], v8, 0); 47 + // store(mem[200..], v16, 0); 48 + // 49 + // ------------------------------------------------------------------------------ 50 + // 1. Initialization functions 51 + // ------------------------------------------------------------------------------ 52 + // 53 + // f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4 54 + // f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8 55 + // f32x16(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32, 56 + // e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16 57 + // 58 + // f32x4s(e0: f32) F32x4 59 + // f32x8s(e0: f32) F32x8 60 + // f32x16s(e0: f32) F32x16 61 + // 62 + // boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4 63 + // boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8 64 + // boolx16(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool, 65 + // e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16 66 + // 67 + // load(mem: []const f32, comptime T: type, comptime len: u32) T 68 + // store(mem: []f32, v: anytype, comptime len: u32) void 69 + // 70 + // loadArr2(arr: [2]f32) F32x4 71 + // loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4 72 + // loadArr3(arr: [3]f32) F32x4 73 + // loadArr3w(arr: [3]f32, w: f32) F32x4 74 + // loadArr4(arr: [4]f32) F32x4 75 + // 76 + // storeArr2(arr: *[2]f32, v: F32x4) void 77 + // storeArr3(arr: *[3]f32, v: F32x4) void 78 + // storeArr4(arr: *[4]f32, v: F32x4) void 79 + // 80 + // arr3Ptr(ptr: anytype) *const [3]f32 81 + // arrNPtr(ptr: anytype) [*]const f32 82 + // 83 + // splat(comptime T: type, value: f32) T 84 + // splatInt(comptime T: type, value: u32) T 85 + // 86 + // ------------------------------------------------------------------------------ 87 + // 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16) 88 + // ------------------------------------------------------------------------------ 89 + // 90 + // all(vb: anytype, comptime len: u32) bool 91 + // any(vb: anytype, comptime len: u32) bool 92 + // 93 + // isNearEqual(v0: F32xN, v1: F32xN, epsilon: F32xN) BoolxN 94 + // isNan(v: F32xN) BoolxN 95 + // isInf(v: F32xN) BoolxN 96 + // isInBounds(v: F32xN, bounds: F32xN) BoolxN 97 + // 98 + // andInt(v0: F32xN, v1: F32xN) F32xN 99 + // andNotInt(v0: F32xN, v1: F32xN) F32xN 100 + // orInt(v0: F32xN, v1: F32xN) F32xN 101 + // norInt(v0: F32xN, v1: F32xN) F32xN 102 + // xorInt(v0: F32xN, v1: F32xN) F32xN 103 + // 104 + // minFast(v0: F32xN, v1: F32xN) F32xN 105 + // maxFast(v0: F32xN, v1: F32xN) F32xN 106 + // min(v0: F32xN, v1: F32xN) F32xN 107 + // max(v0: F32xN, v1: F32xN) F32xN 108 + // round(v: F32xN) F32xN 109 + // floor(v: F32xN) F32xN 110 + // trunc(v: F32xN) F32xN 111 + // ceil(v: F32xN) F32xN 112 + // clamp(v0: F32xN, v1: F32xN) F32xN 113 + // clampFast(v0: F32xN, v1: F32xN) F32xN 114 + // saturate(v: F32xN) F32xN 115 + // saturateFast(v: F32xN) F32xN 116 + // lerp(v0: F32xN, v1: F32xN, t: f32) F32xN 117 + // lerpV(v0: F32xN, v1: F32xN, t: F32xN) F32xN 118 + // lerpInverse(v0: F32xN, v1: F32xN, t: f32) F32xN 119 + // lerpInverseV(v0: F32xN, v1: F32xN, t: F32xN) F32xN 120 + // mapLinear(v: F32xN, min1: f32, max1: f32, min2: f32, max2: f32) F32xN 121 + // mapLinearV(v: F32xN, min1: F32xN, max1: F32xN, min2: F32xN, max2: F32xN) F32xN 122 + // sqrt(v: F32xN) F32xN 123 + // abs(v: F32xN) F32xN 124 + // mod(v0: F32xN, v1: F32xN) F32xN 125 + // modAngle(v: F32xN) F32xN 126 + // mulAdd(v0: F32xN, v1: F32xN, v2: F32xN) F32xN 127 + // select(mask: BoolxN, v0: F32xN, v1: F32xN) 128 + // sin(v: F32xN) F32xN 129 + // cos(v: F32xN) F32xN 130 + // sincos(v: F32xN) [2]F32xN 131 + // asin(v: F32xN) F32xN 132 + // acos(v: F32xN) F32xN 133 + // atan(v: F32xN) F32xN 134 + // atan2(vy: F32xN, vx: F32xN) F32xN 135 + // cmulSoa(re0: F32xN, im0: F32xN, re1: F32xN, im1: F32xN) [2]F32xN 136 + // 137 + // ------------------------------------------------------------------------------ 138 + // 3. 2D, 3D, 4D vector functions 139 + // ------------------------------------------------------------------------------ 140 + // 141 + // swizzle(v: Vec, c, c, c, c) Vec (comptime c = .x | .y | .z | .w) 142 + // dot2(v0: Vec, v1: Vec) F32x4 143 + // dot3(v0: Vec, v1: Vec) F32x4 144 + // dot4(v0: Vec, v1: Vec) F32x4 145 + // cross3(v0: Vec, v1: Vec) Vec 146 + // lengthSq2(v: Vec) F32x4 147 + // lengthSq3(v: Vec) F32x4 148 + // lengthSq4(v: Vec) F32x4 149 + // length2(v: Vec) F32x4 150 + // length3(v: Vec) F32x4 151 + // length4(v: Vec) F32x4 152 + // normalize2(v: Vec) Vec 153 + // normalize3(v: Vec) Vec 154 + // normalize4(v: Vec) Vec 155 + // 156 + // vecToArr2(v: Vec) [2]f32 157 + // vecToArr3(v: Vec) [3]f32 158 + // vecToArr4(v: Vec) [4]f32 159 + // 160 + // ------------------------------------------------------------------------------ 161 + // 4. Matrix functions 162 + // ------------------------------------------------------------------------------ 163 + // 164 + // identity() Mat 165 + // mul(m0: Mat, m1: Mat) Mat 166 + // mul(s: f32, m: Mat) Mat 167 + // mul(m: Mat, s: f32) Mat 168 + // mul(v: Vec, m: Mat) Vec 169 + // mul(m: Mat, v: Vec) Vec 170 + // transpose(m: Mat) Mat 171 + // rotationX(angle: f32) Mat 172 + // rotationY(angle: f32) Mat 173 + // rotationZ(angle: f32) Mat 174 + // translation(x: f32, y: f32, z: f32) Mat 175 + // translationV(v: Vec) Mat 176 + // scaling(x: f32, y: f32, z: f32) Mat 177 + // scalingV(v: Vec) Mat 178 + // lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat 179 + // lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat 180 + // lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat 181 + // lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat 182 + // perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat 183 + // perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat 184 + // perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat 185 + // perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat 186 + // orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat 187 + // orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat 188 + // orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat 189 + // orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat 190 + // orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat 191 + // orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat 192 + // orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat 193 + // orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat 194 + // determinant(m: Mat) F32x4 195 + // inverse(m: Mat) Mat 196 + // inverseDet(m: Mat, det: ?*F32x4) Mat 197 + // matToQuat(m: Mat) Quat 198 + // matFromAxisAngle(axis: Vec, angle: f32) Mat 199 + // matFromNormAxisAngle(axis: Vec, angle: f32) Mat 200 + // matFromQuat(quat: Quat) Mat 201 + // matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat 202 + // matFromRollPitchYawV(angles: Vec) Mat 203 + // matFromArr(arr: [16]f32) Mat 204 + // 205 + // loadMat(mem: []const f32) Mat 206 + // loadMat43(mem: []const f32) Mat 207 + // loadMat34(mem: []const f32) Mat 208 + // storeMat(mem: []f32, m: Mat) void 209 + // storeMat43(mem: []f32, m: Mat) void 210 + // storeMat34(mem: []f32, m: Mat) void 211 + // 212 + // matToArr(m: Mat) [16]f32 213 + // matToArr43(m: Mat) [12]f32 214 + // matToArr34(m: Mat) [12]f32 215 + // 216 + // ------------------------------------------------------------------------------ 217 + // 5. Quaternion functions 218 + // ------------------------------------------------------------------------------ 219 + // 220 + // qmul(q0: Quat, q1: Quat) Quat 221 + // qidentity() Quat 222 + // conjugate(quat: Quat) Quat 223 + // inverse(q: Quat) Quat 224 + // rotate(q: Quat, v: Vec) Vec 225 + // slerp(q0: Quat, q1: Quat, t: f32) Quat 226 + // slerpV(q0: Quat, q1: Quat, t: F32x4) Quat 227 + // quatToMat(quat: Quat) Mat 228 + // quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void 229 + // quatFromMat(m: Mat) Quat 230 + // quatFromAxisAngle(axis: Vec, angle: f32) Quat 231 + // quatFromNormAxisAngle(axis: Vec, angle: f32) Quat 232 + // quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat 233 + // quatFromRollPitchYawV(angles: Vec) Quat 234 + // 235 + // ------------------------------------------------------------------------------ 236 + // 6. Color functions 237 + // ------------------------------------------------------------------------------ 238 + // 239 + // adjustSaturation(color: F32x4, saturation: f32) F32x4 240 + // adjustContrast(color: F32x4, contrast: f32) F32x4 241 + // rgbToHsl(rgb: F32x4) F32x4 242 + // hslToRgb(hsl: F32x4) F32x4 243 + // rgbToHsv(rgb: F32x4) F32x4 244 + // hsvToRgb(hsv: F32x4) F32x4 245 + // rgbToSrgb(rgb: F32x4) F32x4 246 + // srgbToRgb(srgb: F32x4) F32x4 247 + // 248 + // ------------------------------------------------------------------------------ 249 + // X. Misc functions 250 + // ------------------------------------------------------------------------------ 251 + // 252 + // linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4 253 + // sin(v: f32) f32 254 + // cos(v: f32) f32 255 + // sincos(v: f32) [2]f32 256 + // asin(v: f32) f32 257 + // acos(v: f32) f32 258 + // 259 + // fftInitUnityTable(unitytable: []F32x4) void 260 + // fft(re: []F32x4, im: []F32x4, unitytable: []const F32x4) void 261 + // ifft(re: []F32x4, im: []const F32x4, unitytable: []const F32x4) void 262 + // 263 + // ============================================================================== 264 + 265 + // Fundamental types 266 + pub const F32x4 = @Vector(4, f32); 267 + pub const F32x8 = @Vector(8, f32); 268 + pub const F32x16 = @Vector(16, f32); 269 + pub const Boolx4 = @Vector(4, bool); 270 + pub const Boolx8 = @Vector(8, bool); 271 + pub const Boolx16 = @Vector(16, bool); 272 + 273 + // "Higher-level" aliases 274 + pub const Vec = F32x4; 275 + pub const Mat = [4]F32x4; 276 + pub const Quat = F32x4; 277 + 278 + const builtin = @import("builtin"); 279 + const std = @import("std"); 280 + const math = std.math; 281 + const assert = std.debug.assert; 282 + const expect = std.testing.expect; 283 + 284 + const cpu_arch = builtin.cpu.arch; 285 + const has_avx = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx) else false; 286 + const has_avx512f = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .avx512f) else false; 287 + const has_fma = if (cpu_arch == .x86_64) std.Target.x86.featureSetHas(builtin.cpu.features, .fma) else false; 288 + // ------------------------------------------------------------------------------ 289 + // 290 + // 1. Initialization functions 291 + // 292 + // ------------------------------------------------------------------------------ 293 + pub inline fn f32x4(e0: f32, e1: f32, e2: f32, e3: f32) F32x4 { 294 + return .{ e0, e1, e2, e3 }; 295 + } 296 + pub inline fn f32x8(e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32) F32x8 { 297 + return .{ e0, e1, e2, e3, e4, e5, e6, e7 }; 298 + } 299 + // zig fmt: off 300 + pub inline fn f32x16( 301 + e0: f32, e1: f32, e2: f32, e3: f32, e4: f32, e5: f32, e6: f32, e7: f32, 302 + e8: f32, e9: f32, ea: f32, eb: f32, ec: f32, ed: f32, ee: f32, ef: f32) F32x16 { 303 + return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef }; 304 + } 305 + // zig fmt: on 306 + 307 + pub inline fn f32x4s(e0: f32) F32x4 { 308 + return splat(F32x4, e0); 309 + } 310 + pub inline fn f32x8s(e0: f32) F32x8 { 311 + return splat(F32x8, e0); 312 + } 313 + pub inline fn f32x16s(e0: f32) F32x16 { 314 + return splat(F32x16, e0); 315 + } 316 + 317 + pub inline fn boolx4(e0: bool, e1: bool, e2: bool, e3: bool) Boolx4 { 318 + return .{ e0, e1, e2, e3 }; 319 + } 320 + pub inline fn boolx8(e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool) Boolx8 { 321 + return .{ e0, e1, e2, e3, e4, e5, e6, e7 }; 322 + } 323 + // zig fmt: off 324 + pub inline fn boolx16( 325 + e0: bool, e1: bool, e2: bool, e3: bool, e4: bool, e5: bool, e6: bool, e7: bool, 326 + e8: bool, e9: bool, ea: bool, eb: bool, ec: bool, ed: bool, ee: bool, ef: bool) Boolx16 { 327 + return .{ e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, ea, eb, ec, ed, ee, ef }; 328 + } 329 + // zig fmt: on 330 + 331 + pub inline fn veclen(comptime T: type) comptime_int { 332 + return @typeInfo(T).Vector.len; 333 + } 334 + 335 + pub inline fn splat(comptime T: type, value: f32) T { 336 + return @splat(value); 337 + } 338 + pub inline fn splatInt(comptime T: type, value: u32) T { 339 + return @splat(@bitCast(value)); 340 + } 341 + 342 + pub fn load(mem: []const f32, comptime T: type, comptime len: u32) T { 343 + var v = splat(T, 0.0); 344 + const loop_len = if (len == 0) veclen(T) else len; 345 + comptime var i: u32 = 0; 346 + inline while (i < loop_len) : (i += 1) { 347 + v[i] = mem[i]; 348 + } 349 + return v; 350 + } 351 + test "zmath.load" { 352 + const a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 }; 353 + var ptr = &a; 354 + var i: u32 = 0; 355 + const v0 = load(a[i..], F32x4, 2); 356 + try expect(approxEqAbs(v0, F32x4{ 1.0, 2.0, 0.0, 0.0 }, 0.0)); 357 + i += 2; 358 + const v1 = load(a[i .. i + 2], F32x4, 2); 359 + try expect(approxEqAbs(v1, F32x4{ 3.0, 4.0, 0.0, 0.0 }, 0.0)); 360 + const v2 = load(a[5..7], F32x4, 2); 361 + try expect(approxEqAbs(v2, F32x4{ 6.0, 7.0, 0.0, 0.0 }, 0.0)); 362 + const v3 = load(ptr[1..], F32x4, 2); 363 + try expect(approxEqAbs(v3, F32x4{ 2.0, 3.0, 0.0, 0.0 }, 0.0)); 364 + i += 1; 365 + const v4 = load(ptr[i .. i + 2], F32x4, 2); 366 + try expect(approxEqAbs(v4, F32x4{ 4.0, 5.0, 0.0, 0.0 }, 0.0)); 367 + } 368 + 369 + pub fn store(mem: []f32, v: anytype, comptime len: u32) void { 370 + const T = @TypeOf(v); 371 + const loop_len = if (len == 0) veclen(T) else len; 372 + comptime var i: u32 = 0; 373 + inline while (i < loop_len) : (i += 1) { 374 + mem[i] = v[i]; 375 + } 376 + } 377 + test "zmath.store" { 378 + var a = [7]f32{ 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 }; 379 + const v = load(a[1..], F32x4, 3); 380 + store(a[2..], v, 4); 381 + try expect(a[0] == 1.0); 382 + try expect(a[1] == 2.0); 383 + try expect(a[2] == 2.0); 384 + try expect(a[3] == 3.0); 385 + try expect(a[4] == 4.0); 386 + try expect(a[5] == 0.0); 387 + } 388 + 389 + pub inline fn loadArr2(arr: [2]f32) F32x4 { 390 + return f32x4(arr[0], arr[1], 0.0, 0.0); 391 + } 392 + pub inline fn loadArr2zw(arr: [2]f32, z: f32, w: f32) F32x4 { 393 + return f32x4(arr[0], arr[1], z, w); 394 + } 395 + pub inline fn loadArr3(arr: [3]f32) F32x4 { 396 + return f32x4(arr[0], arr[1], arr[2], 0.0); 397 + } 398 + pub inline fn loadArr3w(arr: [3]f32, w: f32) F32x4 { 399 + return f32x4(arr[0], arr[1], arr[2], w); 400 + } 401 + pub inline fn loadArr4(arr: [4]f32) F32x4 { 402 + return f32x4(arr[0], arr[1], arr[2], arr[3]); 403 + } 404 + 405 + pub inline fn storeArr2(arr: *[2]f32, v: F32x4) void { 406 + arr.* = .{ v[0], v[1] }; 407 + } 408 + pub inline fn storeArr3(arr: *[3]f32, v: F32x4) void { 409 + arr.* = .{ v[0], v[1], v[2] }; 410 + } 411 + pub inline fn storeArr4(arr: *[4]f32, v: F32x4) void { 412 + arr.* = .{ v[0], v[1], v[2], v[3] }; 413 + } 414 + 415 + pub inline fn arr3Ptr(ptr: anytype) *const [3]f32 { 416 + comptime assert(@typeInfo(@TypeOf(ptr)) == .Pointer); 417 + const T = std.meta.Child(@TypeOf(ptr)); 418 + comptime assert(T == F32x4); 419 + return @as(*const [3]f32, @ptrCast(ptr)); 420 + } 421 + 422 + pub inline fn arrNPtr(ptr: anytype) [*]const f32 { 423 + comptime assert(@typeInfo(@TypeOf(ptr)) == .Pointer); 424 + const T = std.meta.Child(@TypeOf(ptr)); 425 + comptime assert(T == Mat or T == F32x4 or T == F32x8 or T == F32x16); 426 + return @as([*]const f32, @ptrCast(ptr)); 427 + } 428 + test "zmath.arrNPtr" { 429 + { 430 + const mat = identity(); 431 + const f32ptr = arrNPtr(&mat); 432 + try expect(f32ptr[0] == 1.0); 433 + try expect(f32ptr[5] == 1.0); 434 + try expect(f32ptr[10] == 1.0); 435 + try expect(f32ptr[15] == 1.0); 436 + } 437 + { 438 + const v8 = f32x8s(1.0); 439 + const f32ptr = arrNPtr(&v8); 440 + try expect(f32ptr[1] == 1.0); 441 + try expect(f32ptr[7] == 1.0); 442 + } 443 + } 444 + 445 + test "zmath.loadArr" { 446 + { 447 + const camera_position = [3]f32{ 1.0, 2.0, 3.0 }; 448 + const simd_reg = loadArr3(camera_position); 449 + try expect(approxEqAbs(simd_reg, f32x4(1.0, 2.0, 3.0, 0.0), 0.0)); 450 + } 451 + { 452 + const camera_position = [3]f32{ 1.0, 2.0, 3.0 }; 453 + const simd_reg = loadArr3w(camera_position, 1.0); 454 + try expect(approxEqAbs(simd_reg, f32x4(1.0, 2.0, 3.0, 1.0), 0.0)); 455 + } 456 + } 457 + 458 + pub inline fn vecToArr2(v: Vec) [2]f32 { 459 + return .{ v[0], v[1] }; 460 + } 461 + pub inline fn vecToArr3(v: Vec) [3]f32 { 462 + return .{ v[0], v[1], v[2] }; 463 + } 464 + pub inline fn vecToArr4(v: Vec) [4]f32 { 465 + return .{ v[0], v[1], v[2], v[3] }; 466 + } 467 + // ------------------------------------------------------------------------------ 468 + // 469 + // 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16) 470 + // 471 + // ------------------------------------------------------------------------------ 472 + pub fn all(vb: anytype, comptime len: u32) bool { 473 + const T = @TypeOf(vb); 474 + if (len > veclen(T)) { 475 + @compileError("zmath.all(): 'len' is greater than vector len of type " ++ @typeName(T)); 476 + } 477 + const loop_len = if (len == 0) veclen(T) else len; 478 + const ab: [veclen(T)]bool = vb; 479 + comptime var i: u32 = 0; 480 + var result = true; 481 + inline while (i < loop_len) : (i += 1) { 482 + result = result and ab[i]; 483 + } 484 + return result; 485 + } 486 + test "zmath.all" { 487 + try expect(all(boolx8(true, true, true, true, true, false, true, false), 5) == true); 488 + try expect(all(boolx8(true, true, true, true, true, false, true, false), 6) == false); 489 + try expect(all(boolx8(true, true, true, true, false, false, false, false), 4) == true); 490 + try expect(all(boolx4(true, true, true, false), 3) == true); 491 + try expect(all(boolx4(true, true, true, false), 1) == true); 492 + try expect(all(boolx4(true, false, false, false), 1) == true); 493 + try expect(all(boolx4(false, true, false, false), 1) == false); 494 + try expect(all(boolx8(true, true, true, true, true, false, true, false), 0) == false); 495 + try expect(all(boolx4(false, true, false, false), 0) == false); 496 + try expect(all(boolx4(true, true, true, true), 0) == true); 497 + } 498 + 499 + pub fn any(vb: anytype, comptime len: u32) bool { 500 + const T = @TypeOf(vb); 501 + if (len > veclen(T)) { 502 + @compileError("zmath.any(): 'len' is greater than vector len of type " ++ @typeName(T)); 503 + } 504 + const loop_len = if (len == 0) veclen(T) else len; 505 + const ab: [veclen(T)]bool = vb; 506 + comptime var i: u32 = 0; 507 + var result = false; 508 + inline while (i < loop_len) : (i += 1) { 509 + result = result or ab[i]; 510 + } 511 + return result; 512 + } 513 + test "zmath.any" { 514 + try expect(any(boolx8(true, true, true, true, true, false, true, false), 0) == true); 515 + try expect(any(boolx8(false, false, false, true, true, false, true, false), 3) == false); 516 + try expect(any(boolx8(false, false, false, false, false, true, false, false), 4) == false); 517 + } 518 + 519 + pub inline fn isNearEqual( 520 + v0: anytype, 521 + v1: anytype, 522 + epsilon: anytype, 523 + ) @Vector(veclen(@TypeOf(v0)), bool) { 524 + const T = @TypeOf(v0, v1, epsilon); 525 + const delta = v0 - v1; 526 + const temp = maxFast(delta, splat(T, 0.0) - delta); 527 + return temp <= epsilon; 528 + } 529 + test "zmath.isNearEqual" { 530 + { 531 + const v0 = f32x4(1.0, 2.0, -3.0, 4.001); 532 + const v1 = f32x4(1.0, 2.1, 3.0, 4.0); 533 + const b = isNearEqual(v0, v1, splat(F32x4, 0.01)); 534 + try expect(@reduce(.And, b == boolx4(true, false, false, true))); 535 + } 536 + { 537 + const v0 = f32x8(1.0, 2.0, -3.0, 4.001, 1.001, 2.3, -0.0, 0.0); 538 + const v1 = f32x8(1.0, 2.1, 3.0, 4.0, -1.001, 2.1, 0.0, 0.0); 539 + const b = isNearEqual(v0, v1, splat(F32x8, 0.01)); 540 + try expect(@reduce(.And, b == boolx8(true, false, false, true, false, false, true, true))); 541 + } 542 + try expect(all(isNearEqual( 543 + splat(F32x4, math.inf(f32)), 544 + splat(F32x4, math.inf(f32)), 545 + splat(F32x4, 0.0001), 546 + ), 0) == false); 547 + try expect(all(isNearEqual( 548 + splat(F32x4, -math.inf(f32)), 549 + splat(F32x4, math.inf(f32)), 550 + splat(F32x4, 0.0001), 551 + ), 0) == false); 552 + try expect(all(isNearEqual( 553 + splat(F32x4, -math.inf(f32)), 554 + splat(F32x4, -math.inf(f32)), 555 + splat(F32x4, 0.0001), 556 + ), 0) == false); 557 + try expect(all(isNearEqual( 558 + splat(F32x4, -math.nan(f32)), 559 + splat(F32x4, math.inf(f32)), 560 + splat(F32x4, 0.0001), 561 + ), 0) == false); 562 + } 563 + 564 + pub inline fn isNan( 565 + v: anytype, 566 + ) @Vector(veclen(@TypeOf(v)), bool) { 567 + return v != v; 568 + } 569 + test "zmath.isNan" { 570 + { 571 + const v0 = f32x4(math.inf(f32), math.nan(f32), math.nan(f32), 7.0); 572 + const b = isNan(v0); 573 + try expect(@reduce(.And, b == boolx4(false, true, true, false))); 574 + } 575 + { 576 + const v0 = f32x8(0, math.nan(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0); 577 + const b = isNan(v0); 578 + try expect(@reduce(.And, b == boolx8(false, true, false, false, false, true, true, false))); 579 + } 580 + } 581 + 582 + pub inline fn isInf( 583 + v: anytype, 584 + ) @Vector(veclen(@TypeOf(v)), bool) { 585 + const T = @TypeOf(v); 586 + return abs(v) == splat(T, math.inf(f32)); 587 + } 588 + test "zmath.isInf" { 589 + { 590 + const v0 = f32x4(math.inf(f32), math.nan(f32), math.snan(f32), 7.0); 591 + const b = isInf(v0); 592 + try expect(@reduce(.And, b == boolx4(true, false, false, false))); 593 + } 594 + { 595 + const v0 = f32x8(0, math.inf(f32), 0, 0, math.inf(f32), math.nan(f32), math.snan(f32), 7.0); 596 + const b = isInf(v0); 597 + try expect(@reduce(.And, b == boolx8(false, true, false, false, true, false, false, false))); 598 + } 599 + } 600 + 601 + pub inline fn isInBounds( 602 + v: anytype, 603 + bounds: anytype, 604 + ) @Vector(veclen(@TypeOf(v)), bool) { 605 + const T = @TypeOf(v, bounds); 606 + const Tu = @Vector(veclen(T), u1); 607 + const Tr = @Vector(veclen(T), bool); 608 + 609 + // 2 x cmpleps, xorps, load, andps 610 + const b0 = v <= bounds; 611 + const b1 = (bounds * splat(T, -1.0)) <= v; 612 + const b0u = @as(Tu, @bitCast(b0)); 613 + const b1u = @as(Tu, @bitCast(b1)); 614 + return @as(Tr, @bitCast(b0u & b1u)); 615 + } 616 + test "zmath.isInBounds" { 617 + { 618 + const v0 = f32x4(0.5, -2.0, -1.0, 1.9); 619 + const v1 = f32x4(-1.6, -2.001, -1.0, 1.9); 620 + const bounds = f32x4(1.0, 2.0, 1.0, 2.0); 621 + const b0 = isInBounds(v0, bounds); 622 + const b1 = isInBounds(v1, bounds); 623 + try expect(@reduce(.And, b0 == boolx4(true, true, true, true))); 624 + try expect(@reduce(.And, b1 == boolx4(false, false, true, true))); 625 + } 626 + { 627 + const v0 = f32x8(2.0, 1.0, 2.0, 1.0, 0.5, -2.0, -1.0, 1.9); 628 + const bounds = f32x8(1.0, 1.0, 1.0, math.inf(f32), 1.0, math.nan(f32), 1.0, 2.0); 629 + const b0 = isInBounds(v0, bounds); 630 + try expect(@reduce(.And, b0 == boolx8(false, true, false, true, true, false, true, true))); 631 + } 632 + } 633 + 634 + pub inline fn andInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 635 + const T = @TypeOf(v0, v1); 636 + const Tu = @Vector(veclen(T), u32); 637 + const v0u = @as(Tu, @bitCast(v0)); 638 + const v1u = @as(Tu, @bitCast(v1)); 639 + return @as(T, @bitCast(v0u & v1u)); // andps 640 + } 641 + test "zmath.andInt" { 642 + { 643 + const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); 644 + const v1 = f32x4(1.0, 2.0, 3.0, math.inf(f32)); 645 + const v = andInt(v0, v1); 646 + try expect(v[3] == math.inf(f32)); 647 + try expect(approxEqAbs(v, f32x4(0.0, 2.0, 0.0, math.inf(f32)), 0.0)); 648 + } 649 + { 650 + const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); 651 + const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, math.inf(f32)); 652 + const v = andInt(v0, v1); 653 + try expect(v[7] == math.inf(f32)); 654 + try expect(approxEqAbs(v, f32x8(0, 0, 0, 0, 0.0, 2.0, 0.0, math.inf(f32)), 0.0)); 655 + } 656 + } 657 + 658 + pub inline fn andNotInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 659 + const T = @TypeOf(v0, v1); 660 + const Tu = @Vector(veclen(T), u32); 661 + const v0u = @as(Tu, @bitCast(v0)); 662 + const v1u = @as(Tu, @bitCast(v1)); 663 + return @as(T, @bitCast(~v0u & v1u)); // andnps 664 + } 665 + test "zmath.andNotInt" { 666 + { 667 + const v0 = f32x4(1.0, 2.0, 3.0, 4.0); 668 + const v1 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); 669 + const v = andNotInt(v1, v0); 670 + try expect(approxEqAbs(v, f32x4(1.0, 0.0, 3.0, 0.0), 0.0)); 671 + } 672 + { 673 + const v0 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0); 674 + const v1 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, @as(f32, @bitCast(~@as(u32, 0)))); 675 + const v = andNotInt(v1, v0); 676 + try expect(approxEqAbs(v, f32x8(0, 0, 0, 0, 1.0, 0.0, 3.0, 0.0), 0.0)); 677 + } 678 + } 679 + 680 + pub inline fn orInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 681 + const T = @TypeOf(v0, v1); 682 + const Tu = @Vector(veclen(T), u32); 683 + const v0u = @as(Tu, @bitCast(v0)); 684 + const v1u = @as(Tu, @bitCast(v1)); 685 + return @as(T, @bitCast(v0u | v1u)); // orps 686 + } 687 + test "zmath.orInt" { 688 + { 689 + const v0 = f32x4(0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); 690 + const v1 = f32x4(1.0, 2.0, 3.0, 4.0); 691 + const v = orInt(v0, v1); 692 + try expect(v[0] == 1.0); 693 + try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0)); 694 + try expect(v[2] == 3.0); 695 + try expect(v[3] == 4.0); 696 + } 697 + { 698 + const v0 = f32x8(0, 0, 0, 0, 0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); 699 + const v1 = f32x8(0, 0, 0, 0, 1.0, 2.0, 3.0, 4.0); 700 + const v = orInt(v0, v1); 701 + try expect(v[4] == 1.0); 702 + try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0)); 703 + try expect(v[6] == 3.0); 704 + try expect(v[7] == 4.0); 705 + } 706 + } 707 + 708 + pub inline fn norInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 709 + const T = @TypeOf(v0, v1); 710 + const Tu = @Vector(veclen(T), u32); 711 + const v0u = @as(Tu, @bitCast(v0)); 712 + const v1u = @as(Tu, @bitCast(v1)); 713 + return @as(T, @bitCast(~(v0u | v1u))); // por, pcmpeqd, pxor 714 + } 715 + 716 + pub inline fn xorInt(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 717 + const T = @TypeOf(v0, v1); 718 + const Tu = @Vector(veclen(T), u32); 719 + const v0u = @as(Tu, @bitCast(v0)); 720 + const v1u = @as(Tu, @bitCast(v1)); 721 + return @as(T, @bitCast(v0u ^ v1u)); // xorps 722 + } 723 + test "zmath.xorInt" { 724 + { 725 + const v0 = f32x4(1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); 726 + const v1 = f32x4(1.0, 0, 0, 0); 727 + const v = xorInt(v0, v1); 728 + try expect(v[0] == 0.0); 729 + try expect(@as(u32, @bitCast(v[1])) == ~@as(u32, 0)); 730 + try expect(v[2] == 0.0); 731 + try expect(v[3] == 0.0); 732 + } 733 + { 734 + const v0 = f32x8(0, 0, 0, 0, 1.0, @as(f32, @bitCast(~@as(u32, 0))), 0, 0); 735 + const v1 = f32x8(0, 0, 0, 0, 1.0, 0, 0, 0); 736 + const v = xorInt(v0, v1); 737 + try expect(v[4] == 0.0); 738 + try expect(@as(u32, @bitCast(v[5])) == ~@as(u32, 0)); 739 + try expect(v[6] == 0.0); 740 + try expect(v[7] == 0.0); 741 + } 742 + } 743 + 744 + pub inline fn minFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 745 + return select(v0 < v1, v0, v1); // minps 746 + } 747 + test "zmath.minFast" { 748 + { 749 + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); 750 + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 751 + const v = minFast(v0, v1); 752 + try expect(approxEqAbs(v, f32x4(1.0, 1.0, 2.0, 7.0), 0.0)); 753 + } 754 + { 755 + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); 756 + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 757 + const v = minFast(v0, v1); 758 + try expect(v[0] == 1.0); 759 + try expect(v[1] == 1.0); 760 + try expect(!math.isNan(v[1])); 761 + try expect(v[2] == 4.0); 762 + try expect(v[3] == math.inf(f32)); 763 + try expect(!math.isNan(v[3])); 764 + } 765 + } 766 + 767 + pub inline fn maxFast(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 768 + return select(v0 > v1, v0, v1); // maxps 769 + } 770 + test "zmath.maxFast" { 771 + { 772 + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); 773 + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 774 + const v = maxFast(v0, v1); 775 + try expect(approxEqAbs(v, f32x4(2.0, 3.0, 4.0, math.inf(f32)), 0.0)); 776 + } 777 + { 778 + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); 779 + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 780 + const v = maxFast(v0, v1); 781 + try expect(v[0] == 2.0); 782 + try expect(v[1] == 1.0); 783 + try expect(v[2] == 5.0); 784 + try expect(v[3] == math.inf(f32)); 785 + try expect(!math.isNan(v[3])); 786 + } 787 + } 788 + 789 + pub inline fn min(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 790 + // This will handle inf & nan 791 + return @min(v0, v1); // minps, cmpunordps, andps, andnps, orps 792 + } 793 + test "zmath.min" { 794 + // Calling math.inf causes test to fail! 795 + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; 796 + { 797 + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); 798 + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 799 + const v = min(v0, v1); 800 + try expect(approxEqAbs(v, f32x4(1.0, 1.0, 2.0, 7.0), 0.0)); 801 + } 802 + { 803 + const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0); 804 + const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32)); 805 + const v = min(v0, v1); 806 + try expect(approxEqAbs(v, f32x8(0.0, 0.0, -2.0, 0.0, 1.0, 1.0, 2.0, 7.0), 0.0)); 807 + } 808 + { 809 + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); 810 + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 811 + const v = min(v0, v1); 812 + try expect(v[0] == 1.0); 813 + try expect(v[1] == 1.0); 814 + try expect(!math.isNan(v[1])); 815 + try expect(v[2] == 4.0); 816 + try expect(v[3] == math.inf(f32)); 817 + try expect(!math.isNan(v[3])); 818 + } 819 + 820 + { 821 + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32)); 822 + const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32)); 823 + const v = min(v0, v1); 824 + try expect(v[0] == -math.inf(f32)); 825 + try expect(v[1] == -math.inf(f32)); 826 + try expect(v[2] == math.inf(f32)); 827 + try expect(!math.isNan(v[2])); 828 + try expect(math.isNan(v[3])); 829 + try expect(!math.isInf(v[3])); 830 + } 831 + } 832 + 833 + pub inline fn max(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 834 + // This will handle inf & nan 835 + return @max(v0, v1); // maxps, cmpunordps, andps, andnps, orps 836 + } 837 + test "zmath.max" { 838 + // Calling math.inf causes test to fail! 839 + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; 840 + { 841 + const v0 = f32x4(1.0, 3.0, 2.0, 7.0); 842 + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 843 + const v = max(v0, v1); 844 + try expect(approxEqAbs(v, f32x4(2.0, 3.0, 4.0, math.inf(f32)), 0.0)); 845 + } 846 + { 847 + const v0 = f32x8(0, 0, -2.0, 0, 1.0, 3.0, 2.0, 7.0); 848 + const v1 = f32x8(0, 1.0, 0, 0, 2.0, 1.0, 4.0, math.inf(f32)); 849 + const v = max(v0, v1); 850 + try expect(approxEqAbs(v, f32x8(0.0, 1.0, 0.0, 0.0, 2.0, 3.0, 4.0, math.inf(f32)), 0.0)); 851 + } 852 + { 853 + const v0 = f32x4(1.0, math.nan(f32), 5.0, math.snan(f32)); 854 + const v1 = f32x4(2.0, 1.0, 4.0, math.inf(f32)); 855 + const v = max(v0, v1); 856 + try expect(v[0] == 2.0); 857 + try expect(v[1] == 1.0); 858 + try expect(v[2] == 5.0); 859 + try expect(v[3] == math.inf(f32)); 860 + try expect(!math.isNan(v[3])); 861 + } 862 + { 863 + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.inf(f32), math.snan(f32)); 864 + const v1 = f32x4(math.snan(f32), -math.inf(f32), math.snan(f32), math.nan(f32)); 865 + const v = max(v0, v1); 866 + try expect(v[0] == -math.inf(f32)); 867 + try expect(v[1] == math.inf(f32)); 868 + try expect(v[2] == math.inf(f32)); 869 + try expect(!math.isNan(v[2])); 870 + try expect(math.isNan(v[3])); 871 + try expect(!math.isInf(v[3])); 872 + } 873 + } 874 + 875 + pub fn round(v: anytype) @TypeOf(v) { 876 + const T = @TypeOf(v); 877 + if (cpu_arch == .x86_64 and has_avx) { 878 + if (T == F32x4) { 879 + return asm ("vroundps $0, %%xmm0, %%xmm0" 880 + : [ret] "={xmm0}" (-> T), 881 + : [v] "{xmm0}" (v), 882 + ); 883 + } else if (T == F32x8) { 884 + return asm ("vroundps $0, %%ymm0, %%ymm0" 885 + : [ret] "={ymm0}" (-> T), 886 + : [v] "{ymm0}" (v), 887 + ); 888 + } else if (T == F32x16 and has_avx512f) { 889 + return asm ("vrndscaleps $0, %%zmm0, %%zmm0" 890 + : [ret] "={zmm0}" (-> T), 891 + : [v] "{zmm0}" (v), 892 + ); 893 + } else if (T == F32x16 and !has_avx512f) { 894 + const arr: [16]f32 = v; 895 + var ymm0 = @as(F32x8, arr[0..8].*); 896 + var ymm1 = @as(F32x8, arr[8..16].*); 897 + ymm0 = asm ("vroundps $0, %%ymm0, %%ymm0" 898 + : [ret] "={ymm0}" (-> F32x8), 899 + : [v] "{ymm0}" (ymm0), 900 + ); 901 + ymm1 = asm ("vroundps $0, %%ymm1, %%ymm1" 902 + : [ret] "={ymm1}" (-> F32x8), 903 + : [v] "{ymm1}" (ymm1), 904 + ); 905 + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); 906 + } 907 + } else { 908 + const sign = andInt(v, splatNegativeZero(T)); 909 + const magic = orInt(splatNoFraction(T), sign); 910 + var r1 = v + magic; 911 + r1 = r1 - magic; 912 + const r2 = abs(v); 913 + const mask = r2 <= splatNoFraction(T); 914 + return select(mask, r1, v); 915 + } 916 + } 917 + test "zmath.round" { 918 + { 919 + try expect(all(round(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); 920 + try expect(all(round(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); 921 + try expect(all(isNan(round(splat(F32x4, math.nan(f32)))), 0)); 922 + try expect(all(isNan(round(splat(F32x4, -math.nan(f32)))), 0)); 923 + try expect(all(isNan(round(splat(F32x4, math.snan(f32)))), 0)); 924 + try expect(all(isNan(round(splat(F32x4, -math.snan(f32)))), 0)); 925 + } 926 + { 927 + const v = round(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); 928 + try expect(approxEqAbs( 929 + v, 930 + f32x16(1.0, -1.0, -2.0, 2.0, 2.0, 3.0, 3.0, 4.0, 6.0, 6.0, 8.0, 9.0, 10.0, 11.0, 13.0, 13.0), 931 + 0.0, 932 + )); 933 + } 934 + var v = round(f32x4(1.1, -1.1, -1.5, 1.5)); 935 + try expect(approxEqAbs(v, f32x4(1.0, -1.0, -2.0, 2.0), 0.0)); 936 + 937 + const v1 = f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32)); 938 + v = round(v1); 939 + try expect(v[3] == math.inf(f32)); 940 + try expect(approxEqAbs(v, f32x4(-10_000_000.1, -math.inf(f32), 10_000_001.5, math.inf(f32)), 0.0)); 941 + 942 + const v2 = f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32)); 943 + v = round(v2); 944 + try expect(math.isNan(v2[0])); 945 + try expect(math.isNan(v2[1])); 946 + try expect(math.isNan(v2[2])); 947 + try expect(v2[3] == -math.inf(f32)); 948 + 949 + const v3 = f32x4(1001.5, -201.499, -10000.99, -101.5); 950 + v = round(v3); 951 + try expect(approxEqAbs(v, f32x4(1002.0, -201.0, -10001.0, -102.0), 0.0)); 952 + 953 + const v4 = f32x4(-1_388_609.9, 1_388_609.5, 1_388_109.01, 2_388_609.5); 954 + v = round(v4); 955 + try expect(approxEqAbs(v, f32x4(-1_388_610.0, 1_388_610.0, 1_388_109.0, 2_388_610.0), 0.0)); 956 + 957 + var f: f32 = -100.0; 958 + var i: u32 = 0; 959 + while (i < 100) : (i += 1) { 960 + const vr = round(splat(F32x4, f)); 961 + const fr = @round(splat(F32x4, f)); 962 + const vr8 = round(splat(F32x8, f)); 963 + const fr8 = @round(splat(F32x8, f)); 964 + const vr16 = round(splat(F32x16, f)); 965 + const fr16 = @round(splat(F32x16, f)); 966 + try expect(approxEqAbs(vr, fr, 0.0)); 967 + try expect(approxEqAbs(vr8, fr8, 0.0)); 968 + try expect(approxEqAbs(vr16, fr16, 0.0)); 969 + f += 0.12345 * @as(f32, @floatFromInt(i)); 970 + } 971 + } 972 + 973 + pub fn trunc(v: anytype) @TypeOf(v) { 974 + const T = @TypeOf(v); 975 + if (cpu_arch == .x86_64 and has_avx) { 976 + if (T == F32x4) { 977 + return asm ("vroundps $3, %%xmm0, %%xmm0" 978 + : [ret] "={xmm0}" (-> T), 979 + : [v] "{xmm0}" (v), 980 + ); 981 + } else if (T == F32x8) { 982 + return asm ("vroundps $3, %%ymm0, %%ymm0" 983 + : [ret] "={ymm0}" (-> T), 984 + : [v] "{ymm0}" (v), 985 + ); 986 + } else if (T == F32x16 and has_avx512f) { 987 + return asm ("vrndscaleps $3, %%zmm0, %%zmm0" 988 + : [ret] "={zmm0}" (-> T), 989 + : [v] "{zmm0}" (v), 990 + ); 991 + } else if (T == F32x16 and !has_avx512f) { 992 + const arr: [16]f32 = v; 993 + var ymm0 = @as(F32x8, arr[0..8].*); 994 + var ymm1 = @as(F32x8, arr[8..16].*); 995 + ymm0 = asm ("vroundps $3, %%ymm0, %%ymm0" 996 + : [ret] "={ymm0}" (-> F32x8), 997 + : [v] "{ymm0}" (ymm0), 998 + ); 999 + ymm1 = asm ("vroundps $3, %%ymm1, %%ymm1" 1000 + : [ret] "={ymm1}" (-> F32x8), 1001 + : [v] "{ymm1}" (ymm1), 1002 + ); 1003 + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); 1004 + } 1005 + } else { 1006 + const mask = abs(v) < splatNoFraction(T); 1007 + const result = floatToIntAndBack(v); 1008 + return select(mask, result, v); 1009 + } 1010 + } 1011 + test "zmath.trunc" { 1012 + { 1013 + try expect(all(trunc(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); 1014 + try expect(all(trunc(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); 1015 + try expect(all(isNan(trunc(splat(F32x4, math.nan(f32)))), 0)); 1016 + try expect(all(isNan(trunc(splat(F32x4, -math.nan(f32)))), 0)); 1017 + try expect(all(isNan(trunc(splat(F32x4, math.snan(f32)))), 0)); 1018 + try expect(all(isNan(trunc(splat(F32x4, -math.snan(f32)))), 0)); 1019 + } 1020 + { 1021 + const v = trunc(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); 1022 + try expect(approxEqAbs( 1023 + v, 1024 + f32x16(1.0, -1.0, -1.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0), 1025 + 0.0, 1026 + )); 1027 + } 1028 + var v = trunc(f32x4(1.1, -1.1, -1.5, 1.5)); 1029 + try expect(approxEqAbs(v, f32x4(1.0, -1.0, -1.0, 1.0), 0.0)); 1030 + 1031 + v = trunc(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); 1032 + try expect(approxEqAbs(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32)), 0.0)); 1033 + 1034 + v = trunc(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); 1035 + try expect(math.isNan(v[0])); 1036 + try expect(math.isNan(v[1])); 1037 + try expect(math.isNan(v[2])); 1038 + try expect(v[3] == -math.inf(f32)); 1039 + 1040 + v = trunc(f32x4(1000.5001, -201.499, -10000.99, 100.750001)); 1041 + try expect(approxEqAbs(v, f32x4(1000.0, -201.0, -10000.0, 100.0), 0.0)); 1042 + 1043 + v = trunc(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5)); 1044 + try expect(approxEqAbs(v, f32x4(-7_388_609.0, 7_388_609.0, 8_388_109.0, -8_388_509.0), 0.0)); 1045 + 1046 + var f: f32 = -100.0; 1047 + var i: u32 = 0; 1048 + while (i < 100) : (i += 1) { 1049 + const vr = trunc(splat(F32x4, f)); 1050 + const fr = @trunc(splat(F32x4, f)); 1051 + const vr8 = trunc(splat(F32x8, f)); 1052 + const fr8 = @trunc(splat(F32x8, f)); 1053 + const vr16 = trunc(splat(F32x16, f)); 1054 + const fr16 = @trunc(splat(F32x16, f)); 1055 + try expect(approxEqAbs(vr, fr, 0.0)); 1056 + try expect(approxEqAbs(vr8, fr8, 0.0)); 1057 + try expect(approxEqAbs(vr16, fr16, 0.0)); 1058 + f += 0.12345 * @as(f32, @floatFromInt(i)); 1059 + } 1060 + } 1061 + 1062 + pub fn floor(v: anytype) @TypeOf(v) { 1063 + const T = @TypeOf(v); 1064 + if (cpu_arch == .x86_64 and has_avx) { 1065 + if (T == F32x4) { 1066 + return asm ("vroundps $1, %%xmm0, %%xmm0" 1067 + : [ret] "={xmm0}" (-> T), 1068 + : [v] "{xmm0}" (v), 1069 + ); 1070 + } else if (T == F32x8) { 1071 + return asm ("vroundps $1, %%ymm0, %%ymm0" 1072 + : [ret] "={ymm0}" (-> T), 1073 + : [v] "{ymm0}" (v), 1074 + ); 1075 + } else if (T == F32x16 and has_avx512f) { 1076 + return asm ("vrndscaleps $1, %%zmm0, %%zmm0" 1077 + : [ret] "={zmm0}" (-> T), 1078 + : [v] "{zmm0}" (v), 1079 + ); 1080 + } else if (T == F32x16 and !has_avx512f) { 1081 + const arr: [16]f32 = v; 1082 + var ymm0 = @as(F32x8, arr[0..8].*); 1083 + var ymm1 = @as(F32x8, arr[8..16].*); 1084 + ymm0 = asm ("vroundps $1, %%ymm0, %%ymm0" 1085 + : [ret] "={ymm0}" (-> F32x8), 1086 + : [v] "{ymm0}" (ymm0), 1087 + ); 1088 + ymm1 = asm ("vroundps $1, %%ymm1, %%ymm1" 1089 + : [ret] "={ymm1}" (-> F32x8), 1090 + : [v] "{ymm1}" (ymm1), 1091 + ); 1092 + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); 1093 + } 1094 + } else { 1095 + const mask = abs(v) < splatNoFraction(T); 1096 + var result = floatToIntAndBack(v); 1097 + const larger_mask = result > v; 1098 + const larger = select(larger_mask, splat(T, -1.0), splat(T, 0.0)); 1099 + result = result + larger; 1100 + return select(mask, result, v); 1101 + } 1102 + } 1103 + test "zmath.floor" { 1104 + { 1105 + try expect(all(floor(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); 1106 + try expect(all(floor(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); 1107 + try expect(all(isNan(floor(splat(F32x4, math.nan(f32)))), 0)); 1108 + try expect(all(isNan(floor(splat(F32x4, -math.nan(f32)))), 0)); 1109 + try expect(all(isNan(floor(splat(F32x4, math.snan(f32)))), 0)); 1110 + try expect(all(isNan(floor(splat(F32x4, -math.snan(f32)))), 0)); 1111 + } 1112 + { 1113 + const v = floor(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); 1114 + try expect(approxEqAbs( 1115 + v, 1116 + f32x16(1.0, -2.0, -2.0, 1.0, 2.0, 2.0, 2.0, 4.0, 5.0, 6.0, 7.0, 8.0, 10.0, 11.0, 12.0, 13.0), 1117 + 0.0, 1118 + )); 1119 + } 1120 + var v = floor(f32x4(1.5, -1.5, -1.7, -2.1)); 1121 + try expect(approxEqAbs(v, f32x4(1.0, -2.0, -2.0, -3.0), 0.0)); 1122 + 1123 + v = floor(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); 1124 + try expect(approxEqAbs(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32)), 0.0)); 1125 + 1126 + v = floor(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); 1127 + try expect(math.isNan(v[0])); 1128 + try expect(math.isNan(v[1])); 1129 + try expect(math.isNan(v[2])); 1130 + try expect(v[3] == -math.inf(f32)); 1131 + 1132 + v = floor(f32x4(1000.5001, -201.499, -10000.99, 100.75001)); 1133 + try expect(approxEqAbs(v, f32x4(1000.0, -202.0, -10001.0, 100.0), 0.0)); 1134 + 1135 + v = floor(f32x4(-7_388_609.5, 7_388_609.1, 8_388_109.5, -8_388_509.5)); 1136 + try expect(approxEqAbs(v, f32x4(-7_388_610.0, 7_388_609.0, 8_388_109.0, -8_388_510.0), 0.0)); 1137 + 1138 + var f: f32 = -100.0; 1139 + var i: u32 = 0; 1140 + while (i < 100) : (i += 1) { 1141 + const vr = floor(splat(F32x4, f)); 1142 + const fr = @floor(splat(F32x4, f)); 1143 + const vr8 = floor(splat(F32x8, f)); 1144 + const fr8 = @floor(splat(F32x8, f)); 1145 + const vr16 = floor(splat(F32x16, f)); 1146 + const fr16 = @floor(splat(F32x16, f)); 1147 + try expect(approxEqAbs(vr, fr, 0.0)); 1148 + try expect(approxEqAbs(vr8, fr8, 0.0)); 1149 + try expect(approxEqAbs(vr16, fr16, 0.0)); 1150 + f += 0.12345 * @as(f32, @floatFromInt(i)); 1151 + } 1152 + } 1153 + 1154 + pub fn ceil(v: anytype) @TypeOf(v) { 1155 + const T = @TypeOf(v); 1156 + if (cpu_arch == .x86_64 and has_avx) { 1157 + if (T == F32x4) { 1158 + return asm ("vroundps $2, %%xmm0, %%xmm0" 1159 + : [ret] "={xmm0}" (-> T), 1160 + : [v] "{xmm0}" (v), 1161 + ); 1162 + } else if (T == F32x8) { 1163 + return asm ("vroundps $2, %%ymm0, %%ymm0" 1164 + : [ret] "={ymm0}" (-> T), 1165 + : [v] "{ymm0}" (v), 1166 + ); 1167 + } else if (T == F32x16 and has_avx512f) { 1168 + return asm ("vrndscaleps $2, %%zmm0, %%zmm0" 1169 + : [ret] "={zmm0}" (-> T), 1170 + : [v] "{zmm0}" (v), 1171 + ); 1172 + } else if (T == F32x16 and !has_avx512f) { 1173 + const arr: [16]f32 = v; 1174 + var ymm0 = @as(F32x8, arr[0..8].*); 1175 + var ymm1 = @as(F32x8, arr[8..16].*); 1176 + ymm0 = asm ("vroundps $2, %%ymm0, %%ymm0" 1177 + : [ret] "={ymm0}" (-> F32x8), 1178 + : [v] "{ymm0}" (ymm0), 1179 + ); 1180 + ymm1 = asm ("vroundps $2, %%ymm1, %%ymm1" 1181 + : [ret] "={ymm1}" (-> F32x8), 1182 + : [v] "{ymm1}" (ymm1), 1183 + ); 1184 + return @shuffle(f32, ymm0, ymm1, [16]i32{ 0, 1, 2, 3, 4, 5, 6, 7, -1, -2, -3, -4, -5, -6, -7, -8 }); 1185 + } 1186 + } else { 1187 + const mask = abs(v) < splatNoFraction(T); 1188 + var result = floatToIntAndBack(v); 1189 + const smaller_mask = result < v; 1190 + const smaller = select(smaller_mask, splat(T, -1.0), splat(T, 0.0)); 1191 + result = result - smaller; 1192 + return select(mask, result, v); 1193 + } 1194 + } 1195 + test "zmath.ceil" { 1196 + { 1197 + try expect(all(ceil(splat(F32x4, math.inf(f32))) == splat(F32x4, math.inf(f32)), 0)); 1198 + try expect(all(ceil(splat(F32x4, -math.inf(f32))) == splat(F32x4, -math.inf(f32)), 0)); 1199 + try expect(all(isNan(ceil(splat(F32x4, math.nan(f32)))), 0)); 1200 + try expect(all(isNan(ceil(splat(F32x4, -math.nan(f32)))), 0)); 1201 + try expect(all(isNan(ceil(splat(F32x4, math.snan(f32)))), 0)); 1202 + try expect(all(isNan(ceil(splat(F32x4, -math.snan(f32)))), 0)); 1203 + } 1204 + { 1205 + const v = ceil(f32x16(1.1, -1.1, -1.5, 1.5, 2.1, 2.8, 2.9, 4.1, 5.8, 6.1, 7.9, 8.9, 10.1, 11.2, 12.7, 13.1)); 1206 + try expect(approxEqAbs( 1207 + v, 1208 + f32x16(2.0, -1.0, -1.0, 2.0, 3.0, 3.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0, 11.0, 12.0, 13.0, 14.0), 1209 + 0.0, 1210 + )); 1211 + } 1212 + var v = ceil(f32x4(1.5, -1.5, -1.7, -2.1)); 1213 + try expect(approxEqAbs(v, f32x4(2.0, -1.0, -1.0, -2.0), 0.0)); 1214 + 1215 + v = ceil(f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32))); 1216 + try expect(approxEqAbs(v, f32x4(-10_000_002.1, -math.inf(f32), 10_000_001.5, math.inf(f32)), 0.0)); 1217 + 1218 + v = ceil(f32x4(-math.snan(f32), math.snan(f32), math.nan(f32), -math.inf(f32))); 1219 + try expect(math.isNan(v[0])); 1220 + try expect(math.isNan(v[1])); 1221 + try expect(math.isNan(v[2])); 1222 + try expect(v[3] == -math.inf(f32)); 1223 + 1224 + v = ceil(f32x4(1000.5001, -201.499, -10000.99, 100.75001)); 1225 + try expect(approxEqAbs(v, f32x4(1001.0, -201.0, -10000.0, 101.0), 0.0)); 1226 + 1227 + v = ceil(f32x4(-1_388_609.5, 1_388_609.1, 1_388_109.9, -1_388_509.9)); 1228 + try expect(approxEqAbs(v, f32x4(-1_388_609.0, 1_388_610.0, 1_388_110.0, -1_388_509.0), 0.0)); 1229 + 1230 + var f: f32 = -100.0; 1231 + var i: u32 = 0; 1232 + while (i < 100) : (i += 1) { 1233 + const vr = ceil(splat(F32x4, f)); 1234 + const fr = @ceil(splat(F32x4, f)); 1235 + const vr8 = ceil(splat(F32x8, f)); 1236 + const fr8 = @ceil(splat(F32x8, f)); 1237 + const vr16 = ceil(splat(F32x16, f)); 1238 + const fr16 = @ceil(splat(F32x16, f)); 1239 + try expect(approxEqAbs(vr, fr, 0.0)); 1240 + try expect(approxEqAbs(vr8, fr8, 0.0)); 1241 + try expect(approxEqAbs(vr16, fr16, 0.0)); 1242 + f += 0.12345 * @as(f32, @floatFromInt(i)); 1243 + } 1244 + } 1245 + 1246 + pub inline fn clamp(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) { 1247 + var result = max(vmin, v); 1248 + result = min(vmax, result); 1249 + return result; 1250 + } 1251 + test "zmath.clamp" { 1252 + // Calling math.inf causes test to fail! 1253 + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; 1254 + { 1255 + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); 1256 + const v = clamp(v0, splat(F32x4, -0.5), splat(F32x4, 0.5)); 1257 + try expect(approxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001)); 1258 + } 1259 + { 1260 + const v0 = f32x8(-2.0, 0.25, -0.25, 100.0, -1.0, 0.2, 1.1, -0.3); 1261 + const v = clamp(v0, splat(F32x8, -0.5), splat(F32x8, 0.5)); 1262 + try expect(approxEqAbs(v, f32x8(-0.5, 0.25, -0.25, 0.5, -0.5, 0.2, 0.5, -0.3), 0.0001)); 1263 + } 1264 + { 1265 + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); 1266 + const v = clamp(v0, f32x4(-100.0, 0.0, -100.0, 0.0), f32x4(0.0, 100.0, 0.0, 100.0)); 1267 + try expect(approxEqAbs(v, f32x4(-100.0, 100.0, -100.0, 0.0), 0.0001)); 1268 + } 1269 + { 1270 + const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); 1271 + const v = clamp(v0, splat(F32x4, -1.0), splat(F32x4, 1.0)); 1272 + try expect(approxEqAbs(v, f32x4(1.0, 1.0, -1.0, -1.0), 0.0001)); 1273 + } 1274 + } 1275 + 1276 + pub inline fn clampFast(v: anytype, vmin: anytype, vmax: anytype) @TypeOf(v, vmin, vmax) { 1277 + var result = maxFast(vmin, v); 1278 + result = minFast(vmax, result); 1279 + return result; 1280 + } 1281 + test "zmath.clampFast" { 1282 + { 1283 + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); 1284 + const v = clampFast(v0, splat(F32x4, -0.5), splat(F32x4, 0.5)); 1285 + try expect(approxEqAbs(v, f32x4(-0.5, 0.2, 0.5, -0.3), 0.0001)); 1286 + } 1287 + } 1288 + 1289 + pub inline fn saturate(v: anytype) @TypeOf(v) { 1290 + const T = @TypeOf(v); 1291 + var result = max(v, splat(T, 0.0)); 1292 + result = min(result, splat(T, 1.0)); 1293 + return result; 1294 + } 1295 + test "zmath.saturate" { 1296 + // Calling math.inf causes test to fail! 1297 + if (builtin.target.os.tag == .macos and builtin.target.cpu.arch == .aarch64) return error.SkipZigTest; 1298 + { 1299 + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); 1300 + const v = saturate(v0); 1301 + try expect(approxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001)); 1302 + } 1303 + { 1304 + const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3); 1305 + const v = saturate(v0); 1306 + try expect(approxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001)); 1307 + } 1308 + { 1309 + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); 1310 + const v = saturate(v0); 1311 + try expect(approxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001)); 1312 + } 1313 + { 1314 + const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); 1315 + const v = saturate(v0); 1316 + try expect(approxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001)); 1317 + } 1318 + } 1319 + 1320 + pub inline fn saturateFast(v: anytype) @TypeOf(v) { 1321 + const T = @TypeOf(v); 1322 + var result = maxFast(v, splat(T, 0.0)); 1323 + result = minFast(result, splat(T, 1.0)); 1324 + return result; 1325 + } 1326 + test "zmath.saturateFast" { 1327 + { 1328 + const v0 = f32x4(-1.0, 0.2, 1.1, -0.3); 1329 + const v = saturateFast(v0); 1330 + try expect(approxEqAbs(v, f32x4(0.0, 0.2, 1.0, 0.0), 0.0001)); 1331 + } 1332 + { 1333 + const v0 = f32x8(0.0, 0.0, 2.0, -2.0, -1.0, 0.2, 1.1, -0.3); 1334 + const v = saturateFast(v0); 1335 + try expect(approxEqAbs(v, f32x8(0.0, 0.0, 1.0, 0.0, 0.0, 0.2, 1.0, 0.0), 0.0001)); 1336 + } 1337 + { 1338 + const v0 = f32x4(-math.inf(f32), math.inf(f32), math.nan(f32), math.snan(f32)); 1339 + const v = saturateFast(v0); 1340 + try expect(approxEqAbs(v, f32x4(0.0, 1.0, 0.0, 0.0), 0.0001)); 1341 + } 1342 + { 1343 + const v0 = f32x4(math.inf(f32), math.inf(f32), -math.nan(f32), -math.snan(f32)); 1344 + const v = saturateFast(v0); 1345 + try expect(approxEqAbs(v, f32x4(1.0, 1.0, 0.0, 0.0), 0.0001)); 1346 + } 1347 + } 1348 + 1349 + pub inline fn sqrt(v: anytype) @TypeOf(v) { 1350 + return @sqrt(v); // sqrtps 1351 + } 1352 + 1353 + pub inline fn abs(v: anytype) @TypeOf(v) { 1354 + return @abs(v); // load, andps 1355 + } 1356 + 1357 + pub inline fn select(mask: anytype, v0: anytype, v1: anytype) @TypeOf(v0, v1) { 1358 + return @select(f32, mask, v0, v1); 1359 + } 1360 + 1361 + pub inline fn lerp(v0: anytype, v1: anytype, t: f32) @TypeOf(v0, v1) { 1362 + const T = @TypeOf(v0, v1); 1363 + return v0 + (v1 - v0) * splat(T, t); // subps, shufps, addps, mulps 1364 + } 1365 + 1366 + pub inline fn lerpV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) { 1367 + return v0 + (v1 - v0) * t; // subps, addps, mulps 1368 + } 1369 + 1370 + pub inline fn lerpInverse(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1) { 1371 + const T = @TypeOf(v0, v1); 1372 + return (splat(T, t) - v0) / (v1 - v0); 1373 + } 1374 + 1375 + pub inline fn lerpInverseV(v0: anytype, v1: anytype, t: anytype) @TypeOf(v0, v1, t) { 1376 + return (t - v0) / (v1 - v0); 1377 + } 1378 + test "zmath.lerpInverse" { 1379 + try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 10.0), 0, 0.0005)); 1380 + try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 100.0), 1, 0.0005)); 1381 + try expect(math.approxEqAbs(f32, lerpInverseV(10.0, 100.0, 55.0), 0.5, 0.05)); 1382 + try expect(approxEqAbs(lerpInverse(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 10.0), f32x4(0.1, 0.05, 0, 0), 0.0005)); 1383 + } 1384 + 1385 + // Frame rate independent lerp (or "damp"), for approaching things over time. 1386 + // Reference: https://www.gamedeveloper.com/programming/improved-lerp-smoothing- 1387 + pub inline fn lerpOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1) { 1388 + const t = std.math.exp2(-rate * dt); 1389 + return lerp(v0, v1, t); 1390 + } 1391 + 1392 + pub inline fn lerpVOverTime(v0: anytype, v1: anytype, rate: anytype, dt: anytype) @TypeOf(v0, v1, rate, dt) { 1393 + const t = std.math.exp2(-rate * dt); 1394 + return lerpV(v0, v1, t); 1395 + } 1396 + test "zmath.lerpOverTime" { 1397 + try expect(math.approxEqAbs(f32, lerpVOverTime(0.0, 1.0, 1.0, 1.0), 0.5, 0.0005)); 1398 + try expect(math.approxEqAbs(f32, lerpVOverTime(0.5, 1.0, 1.0, 1.0), 0.75, 0.0005)); 1399 + try expect(approxEqAbs(lerpOverTime(f32x4(0, 0, 10, 10), f32x4(100, 200, 100, 100), 1.0, 1.0), f32x4(50, 100, 55, 55), 0.0005)); 1400 + } 1401 + 1402 + /// To transform a vector of values from one range to another. 1403 + pub inline fn mapLinear(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v) { 1404 + const T = @TypeOf(v); 1405 + const min1V = splat(T, min1); 1406 + const max1V = splat(T, max1); 1407 + const min2V = splat(T, min2); 1408 + const max2V = splat(T, max2); 1409 + const dV = max1V - min1V; 1410 + return min2V + (v - min1V) * (max2V - min2V) / dV; 1411 + } 1412 + 1413 + pub inline fn mapLinearV(v: anytype, min1: anytype, max1: anytype, min2: anytype, max2: anytype) @TypeOf(v, min1, max1, min2, max2) { 1414 + const d = max1 - min1; 1415 + return min2 + (v - min1) * (max2 - min2) / d; 1416 + } 1417 + test "zmath.mapLinear" { 1418 + try expect(math.approxEqAbs(f32, mapLinearV(0, 0, 1.2, 10, 100), 10, 0.0005)); 1419 + try expect(math.approxEqAbs(f32, mapLinearV(1.2, 0, 1.2, 10, 100), 100, 0.0005)); 1420 + try expect(math.approxEqAbs(f32, mapLinearV(0.6, 0, 1.2, 10, 100), 55, 0.0005)); 1421 + try expect(approxEqAbs(mapLinearV(splat(F32x4, 0), splat(F32x4, 0), splat(F32x4, 1.2), splat(F32x4, 10), splat(F32x4, 100)), splat(F32x4, 10), 0.0005)); 1422 + try expect(approxEqAbs(mapLinear(f32x4(0, 0, 0.6, 1.2), 0, 1.2, 10, 100), f32x4(10, 10, 55, 100), 0.0005)); 1423 + } 1424 + 1425 + pub const F32x4Component = enum { x, y, z, w }; 1426 + 1427 + pub inline fn swizzle( 1428 + v: F32x4, 1429 + comptime x: F32x4Component, 1430 + comptime y: F32x4Component, 1431 + comptime z: F32x4Component, 1432 + comptime w: F32x4Component, 1433 + ) F32x4 { 1434 + return @shuffle(f32, v, undefined, [4]i32{ @intFromEnum(x), @intFromEnum(y), @intFromEnum(z), @intFromEnum(w) }); 1435 + } 1436 + 1437 + pub inline fn mod(v0: anytype, v1: anytype) @TypeOf(v0, v1) { 1438 + // vdivps, vroundps, vmulps, vsubps 1439 + return v0 - v1 * trunc(v0 / v1); 1440 + } 1441 + test "zmath.mod" { 1442 + try expect(approxEqAbs(mod(splat(F32x4, 3.1), splat(F32x4, 1.7)), splat(F32x4, 1.4), 0.0005)); 1443 + try expect(approxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, 2.0)), splat(F32x4, -1.0), 0.0005)); 1444 + try expect(approxEqAbs(mod(splat(F32x4, -3.0), splat(F32x4, -2.0)), splat(F32x4, -1.0), 0.0005)); 1445 + try expect(approxEqAbs(mod(splat(F32x4, 3.0), splat(F32x4, -2.0)), splat(F32x4, 1.0), 0.0005)); 1446 + try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, 1.0))), 0)); 1447 + try expect(all(isNan(mod(splat(F32x4, -math.inf(f32)), splat(F32x4, 123.456))), 0)); 1448 + try expect(all(isNan(mod(splat(F32x4, math.nan(f32)), splat(F32x4, 123.456))), 0)); 1449 + try expect(all(isNan(mod(splat(F32x4, math.snan(f32)), splat(F32x4, 123.456))), 0)); 1450 + try expect(all(isNan(mod(splat(F32x4, -math.snan(f32)), splat(F32x4, 123.456))), 0)); 1451 + try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.inf(f32)))), 0)); 1452 + try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, -math.inf(f32)))), 0)); 1453 + try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32)))), 0)); 1454 + try expect(all(isNan(mod(splat(F32x4, 123.456), splat(F32x4, math.nan(f32)))), 0)); 1455 + try expect(all(isNan(mod(splat(F32x4, math.inf(f32)), splat(F32x4, math.nan(f32)))), 0)); 1456 + } 1457 + 1458 + pub fn modAngle(v: anytype) @TypeOf(v) { 1459 + const T = @TypeOf(v); 1460 + return switch (T) { 1461 + f32 => modAngle32(v), 1462 + F32x4, F32x8, F32x16 => modAngle32xN(v), 1463 + else => @compileError("zmath.modAngle() not implemented for " ++ @typeName(T)), 1464 + }; 1465 + } 1466 + 1467 + pub inline fn modAngle32xN(v: anytype) @TypeOf(v) { 1468 + const T = @TypeOf(v); 1469 + return v - splat(T, math.tau) * round(v * splat(T, 1.0 / math.tau)); // 2 x vmulps, 2 x load, vroundps, vaddps 1470 + } 1471 + test "zmath.modAngle" { 1472 + try expect(approxEqAbs(modAngle(splat(F32x4, math.tau)), splat(F32x4, 0.0), 0.0005)); 1473 + try expect(approxEqAbs(modAngle(splat(F32x4, 0.0)), splat(F32x4, 0.0), 0.0005)); 1474 + try expect(approxEqAbs(modAngle(splat(F32x4, math.pi)), splat(F32x4, math.pi), 0.0005)); 1475 + try expect(approxEqAbs(modAngle(splat(F32x4, 11 * math.pi)), splat(F32x4, math.pi), 0.0005)); 1476 + try expect(approxEqAbs(modAngle(splat(F32x4, 3.5 * math.pi)), splat(F32x4, -0.5 * math.pi), 0.0005)); 1477 + try expect(approxEqAbs(modAngle(splat(F32x4, 2.5 * math.pi)), splat(F32x4, 0.5 * math.pi), 0.0005)); 1478 + } 1479 + 1480 + pub inline fn mulAdd(v0: anytype, v1: anytype, v2: anytype) @TypeOf(v0, v1, v2) { 1481 + const T = @TypeOf(v0, v1, v2); 1482 + if (@import("zmath_options").enable_cross_platform_determinism) { 1483 + return v0 * v1 + v2; // Compiler will generate mul, add sequence (no fma even if the target supports it). 1484 + } else { 1485 + if (cpu_arch == .x86_64 and has_avx and has_fma) { 1486 + return @mulAdd(T, v0, v1, v2); 1487 + } else { 1488 + // NOTE(mziulek): On .x86_64 without HW fma instructions @mulAdd maps to really slow code! 1489 + return v0 * v1 + v2; 1490 + } 1491 + } 1492 + } 1493 + 1494 + fn sin32xN(v: anytype) @TypeOf(v) { 1495 + // 11-degree minimax approximation 1496 + const T = @TypeOf(v); 1497 + 1498 + var x = modAngle(v); 1499 + const sign = andInt(x, splatNegativeZero(T)); 1500 + const c = orInt(sign, splat(T, math.pi)); 1501 + const absx = andNotInt(sign, x); 1502 + const rflx = c - x; 1503 + const comp = absx <= splat(T, 0.5 * math.pi); 1504 + x = select(comp, x, rflx); 1505 + const x2 = x * x; 1506 + 1507 + var result = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06)); 1508 + result = mulAdd(result, x2, splat(T, -0.00019840874)); 1509 + result = mulAdd(result, x2, splat(T, 0.0083333310)); 1510 + result = mulAdd(result, x2, splat(T, -0.16666667)); 1511 + result = mulAdd(result, x2, splat(T, 1.0)); 1512 + return x * result; 1513 + } 1514 + test "zmath.sin" { 1515 + const epsilon = 0.0001; 1516 + 1517 + try expect(approxEqAbs(sin(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 1.0), epsilon)); 1518 + try expect(approxEqAbs(sin(splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon)); 1519 + try expect(approxEqAbs(sin(splat(F32x4, -0.0)), splat(F32x4, -0.0), epsilon)); 1520 + try expect(approxEqAbs(sin(splat(F32x4, 89.123)), splat(F32x4, 0.916166), epsilon)); 1521 + try expect(approxEqAbs(sin(splat(F32x8, 89.123)), splat(F32x8, 0.916166), epsilon)); 1522 + try expect(approxEqAbs(sin(splat(F32x16, 89.123)), splat(F32x16, 0.916166), epsilon)); 1523 + try expect(all(isNan(sin(splat(F32x4, math.inf(f32)))), 0) == true); 1524 + try expect(all(isNan(sin(splat(F32x4, -math.inf(f32)))), 0) == true); 1525 + try expect(all(isNan(sin(splat(F32x4, math.nan(f32)))), 0) == true); 1526 + try expect(all(isNan(sin(splat(F32x4, math.snan(f32)))), 0) == true); 1527 + 1528 + var f: f32 = -100.0; 1529 + var i: u32 = 0; 1530 + while (i < 100) : (i += 1) { 1531 + const vr = sin(splat(F32x4, f)); 1532 + const fr = @sin(splat(F32x4, f)); 1533 + const vr8 = sin(splat(F32x8, f)); 1534 + const fr8 = @sin(splat(F32x8, f)); 1535 + const vr16 = sin(splat(F32x16, f)); 1536 + const fr16 = @sin(splat(F32x16, f)); 1537 + try expect(approxEqAbs(vr, fr, epsilon)); 1538 + try expect(approxEqAbs(vr8, fr8, epsilon)); 1539 + try expect(approxEqAbs(vr16, fr16, epsilon)); 1540 + f += 0.12345 * @as(f32, @floatFromInt(i)); 1541 + } 1542 + } 1543 + 1544 + fn cos32xN(v: anytype) @TypeOf(v) { 1545 + // 10-degree minimax approximation 1546 + const T = @TypeOf(v); 1547 + 1548 + var x = modAngle(v); 1549 + var sign = andInt(x, splatNegativeZero(T)); 1550 + const c = orInt(sign, splat(T, math.pi)); 1551 + const absx = andNotInt(sign, x); 1552 + const rflx = c - x; 1553 + const comp = absx <= splat(T, 0.5 * math.pi); 1554 + x = select(comp, x, rflx); 1555 + sign = select(comp, splat(T, 1.0), splat(T, -1.0)); 1556 + const x2 = x * x; 1557 + 1558 + var result = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05)); 1559 + result = mulAdd(result, x2, splat(T, -0.0013888378)); 1560 + result = mulAdd(result, x2, splat(T, 0.041666638)); 1561 + result = mulAdd(result, x2, splat(T, -0.5)); 1562 + result = mulAdd(result, x2, splat(T, 1.0)); 1563 + return sign * result; 1564 + } 1565 + test "zmath.cos" { 1566 + const epsilon = 0.0001; 1567 + 1568 + try expect(approxEqAbs(cos(splat(F32x4, 0.5 * math.pi)), splat(F32x4, 0.0), epsilon)); 1569 + try expect(approxEqAbs(cos(splat(F32x4, 0.0)), splat(F32x4, 1.0), epsilon)); 1570 + try expect(approxEqAbs(cos(splat(F32x4, -0.0)), splat(F32x4, 1.0), epsilon)); 1571 + try expect(all(isNan(cos(splat(F32x4, math.inf(f32)))), 0) == true); 1572 + try expect(all(isNan(cos(splat(F32x4, -math.inf(f32)))), 0) == true); 1573 + try expect(all(isNan(cos(splat(F32x4, math.nan(f32)))), 0) == true); 1574 + try expect(all(isNan(cos(splat(F32x4, math.snan(f32)))), 0) == true); 1575 + 1576 + var f: f32 = -100.0; 1577 + var i: u32 = 0; 1578 + while (i < 100) : (i += 1) { 1579 + const vr = cos(splat(F32x4, f)); 1580 + const fr = @cos(splat(F32x4, f)); 1581 + const vr8 = cos(splat(F32x8, f)); 1582 + const fr8 = @cos(splat(F32x8, f)); 1583 + const vr16 = cos(splat(F32x16, f)); 1584 + const fr16 = @cos(splat(F32x16, f)); 1585 + try expect(approxEqAbs(vr, fr, epsilon)); 1586 + try expect(approxEqAbs(vr8, fr8, epsilon)); 1587 + try expect(approxEqAbs(vr16, fr16, epsilon)); 1588 + f += 0.12345 * @as(f32, @floatFromInt(i)); 1589 + } 1590 + } 1591 + 1592 + pub fn sin(v: anytype) @TypeOf(v) { 1593 + const T = @TypeOf(v); 1594 + return switch (T) { 1595 + f32 => sin32(v), 1596 + F32x4, F32x8, F32x16 => sin32xN(v), 1597 + else => @compileError("zmath.sin() not implemented for " ++ @typeName(T)), 1598 + }; 1599 + } 1600 + 1601 + pub fn cos(v: anytype) @TypeOf(v) { 1602 + const T = @TypeOf(v); 1603 + return switch (T) { 1604 + f32 => cos32(v), 1605 + F32x4, F32x8, F32x16 => cos32xN(v), 1606 + else => @compileError("zmath.cos() not implemented for " ++ @typeName(T)), 1607 + }; 1608 + } 1609 + 1610 + pub fn sincos(v: anytype) [2]@TypeOf(v) { 1611 + const T = @TypeOf(v); 1612 + return switch (T) { 1613 + f32 => sincos32(v), 1614 + F32x4, F32x8, F32x16 => sincos32xN(v), 1615 + else => @compileError("zmath.sincos() not implemented for " ++ @typeName(T)), 1616 + }; 1617 + } 1618 + 1619 + pub fn asin(v: anytype) @TypeOf(v) { 1620 + const T = @TypeOf(v); 1621 + return switch (T) { 1622 + f32 => asin32(v), 1623 + F32x4, F32x8, F32x16 => asin32xN(v), 1624 + else => @compileError("zmath.asin() not implemented for " ++ @typeName(T)), 1625 + }; 1626 + } 1627 + 1628 + pub fn acos(v: anytype) @TypeOf(v) { 1629 + const T = @TypeOf(v); 1630 + return switch (T) { 1631 + f32 => acos32(v), 1632 + F32x4, F32x8, F32x16 => acos32xN(v), 1633 + else => @compileError("zmath.acos() not implemented for " ++ @typeName(T)), 1634 + }; 1635 + } 1636 + 1637 + fn sincos32xN(v: anytype) [2]@TypeOf(v) { 1638 + const T = @TypeOf(v); 1639 + 1640 + var x = modAngle(v); 1641 + var sign = andInt(x, splatNegativeZero(T)); 1642 + const c = orInt(sign, splat(T, math.pi)); 1643 + const absx = andNotInt(sign, x); 1644 + const rflx = c - x; 1645 + const comp = absx <= splat(T, 0.5 * math.pi); 1646 + x = select(comp, x, rflx); 1647 + sign = select(comp, splat(T, 1.0), splat(T, -1.0)); 1648 + const x2 = x * x; 1649 + 1650 + var sresult = mulAdd(splat(T, -2.3889859e-08), x2, splat(T, 2.7525562e-06)); 1651 + sresult = mulAdd(sresult, x2, splat(T, -0.00019840874)); 1652 + sresult = mulAdd(sresult, x2, splat(T, 0.0083333310)); 1653 + sresult = mulAdd(sresult, x2, splat(T, -0.16666667)); 1654 + sresult = x * mulAdd(sresult, x2, splat(T, 1.0)); 1655 + 1656 + var cresult = mulAdd(splat(T, -2.6051615e-07), x2, splat(T, 2.4760495e-05)); 1657 + cresult = mulAdd(cresult, x2, splat(T, -0.0013888378)); 1658 + cresult = mulAdd(cresult, x2, splat(T, 0.041666638)); 1659 + cresult = mulAdd(cresult, x2, splat(T, -0.5)); 1660 + cresult = sign * mulAdd(cresult, x2, splat(T, 1.0)); 1661 + 1662 + return .{ sresult, cresult }; 1663 + } 1664 + test "zmath.sincos32xN" { 1665 + const epsilon = 0.0001; 1666 + 1667 + var f: f32 = -100.0; 1668 + var i: u32 = 0; 1669 + while (i < 100) : (i += 1) { 1670 + const sc = sincos(splat(F32x4, f)); 1671 + const sc8 = sincos(splat(F32x8, f)); 1672 + const sc16 = sincos(splat(F32x16, f)); 1673 + const s4 = @sin(splat(F32x4, f)); 1674 + const s8 = @sin(splat(F32x8, f)); 1675 + const s16 = @sin(splat(F32x16, f)); 1676 + const c4 = @cos(splat(F32x4, f)); 1677 + const c8 = @cos(splat(F32x8, f)); 1678 + const c16 = @cos(splat(F32x16, f)); 1679 + try expect(approxEqAbs(sc[0], s4, epsilon)); 1680 + try expect(approxEqAbs(sc8[0], s8, epsilon)); 1681 + try expect(approxEqAbs(sc16[0], s16, epsilon)); 1682 + try expect(approxEqAbs(sc[1], c4, epsilon)); 1683 + try expect(approxEqAbs(sc8[1], c8, epsilon)); 1684 + try expect(approxEqAbs(sc16[1], c16, epsilon)); 1685 + f += 0.12345 * @as(f32, @floatFromInt(i)); 1686 + } 1687 + } 1688 + 1689 + fn asin32xN(v: anytype) @TypeOf(v) { 1690 + // 7-degree minimax approximation 1691 + const T = @TypeOf(v); 1692 + 1693 + const x = abs(v); 1694 + const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x)); 1695 + 1696 + var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901)); 1697 + t0 = mulAdd(t0, x, splat(T, -0.0170881256)); 1698 + t0 = mulAdd(t0, x, splat(T, 0.0308918810)); 1699 + t0 = mulAdd(t0, x, splat(T, -0.0501743046)); 1700 + t0 = mulAdd(t0, x, splat(T, 0.0889789874)); 1701 + t0 = mulAdd(t0, x, splat(T, -0.2145988016)); 1702 + t0 = root * mulAdd(t0, x, splat(T, 1.5707963050)); 1703 + 1704 + const t1 = splat(T, math.pi) - t0; 1705 + return splat(T, 0.5 * math.pi) - select(v >= splat(T, 0.0), t0, t1); 1706 + } 1707 + 1708 + fn acos32xN(v: anytype) @TypeOf(v) { 1709 + // 7-degree minimax approximation 1710 + const T = @TypeOf(v); 1711 + 1712 + const x = abs(v); 1713 + const root = sqrt(maxFast(splat(T, 0.0), splat(T, 1.0) - x)); 1714 + 1715 + var t0 = mulAdd(splat(T, -0.0012624911), x, splat(T, 0.0066700901)); 1716 + t0 = mulAdd(t0, x, splat(T, -0.0170881256)); 1717 + t0 = mulAdd(t0, x, splat(T, 0.0308918810)); 1718 + t0 = mulAdd(t0, x, splat(T, -0.0501743046)); 1719 + t0 = mulAdd(t0, x, splat(T, 0.0889789874)); 1720 + t0 = mulAdd(t0, x, splat(T, -0.2145988016)); 1721 + t0 = root * mulAdd(t0, x, splat(T, 1.5707963050)); 1722 + 1723 + const t1 = splat(T, math.pi) - t0; 1724 + return select(v >= splat(T, 0.0), t0, t1); 1725 + } 1726 + 1727 + pub fn atan(v: anytype) @TypeOf(v) { 1728 + // 17-degree minimax approximation 1729 + const T = @TypeOf(v); 1730 + 1731 + const vabs = abs(v); 1732 + const vinv = splat(T, 1.0) / v; 1733 + var sign = select(v > splat(T, 1.0), splat(T, 1.0), splat(T, -1.0)); 1734 + const comp = vabs <= splat(T, 1.0); 1735 + sign = select(comp, splat(T, 0.0), sign); 1736 + const x = select(comp, v, vinv); 1737 + const x2 = x * x; 1738 + 1739 + var result = mulAdd(splat(T, 0.0028662257), x2, splat(T, -0.0161657367)); 1740 + result = mulAdd(result, x2, splat(T, 0.0429096138)); 1741 + result = mulAdd(result, x2, splat(T, -0.0752896400)); 1742 + result = mulAdd(result, x2, splat(T, 0.1065626393)); 1743 + result = mulAdd(result, x2, splat(T, -0.1420889944)); 1744 + result = mulAdd(result, x2, splat(T, 0.1999355085)); 1745 + result = mulAdd(result, x2, splat(T, -0.3333314528)); 1746 + result = x * mulAdd(result, x2, splat(T, 1.0)); 1747 + 1748 + const result1 = sign * splat(T, 0.5 * math.pi) - result; 1749 + return select(sign == splat(T, 0.0), result, result1); 1750 + } 1751 + test "zmath.atan" { 1752 + const epsilon = 0.0001; 1753 + { 1754 + const v = f32x4(0.25, 0.5, 1.0, 1.25); 1755 + const e = f32x4(math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3])); 1756 + try expect(approxEqAbs(e, atan(v), epsilon)); 1757 + } 1758 + { 1759 + const v = f32x8(-0.25, 0.5, -1.0, 1.25, 100.0, -200.0, 300.0, 400.0); 1760 + // zig fmt: off 1761 + const e = f32x8( 1762 + math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]), 1763 + math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]), 1764 + ); 1765 + // zig fmt: on 1766 + try expect(approxEqAbs(e, atan(v), epsilon)); 1767 + } 1768 + { 1769 + // zig fmt: off 1770 + const v = f32x16( 1771 + -0.25, 0.5, -1.0, 0.0, 0.1, -0.2, 30.0, 400.0, 1772 + -0.25, 0.5, -1.0, -0.0, -0.05, -0.125, 0.0625, 4000.0 1773 + ); 1774 + const e = f32x16( 1775 + math.atan(v[0]), math.atan(v[1]), math.atan(v[2]), math.atan(v[3]), 1776 + math.atan(v[4]), math.atan(v[5]), math.atan(v[6]), math.atan(v[7]), 1777 + math.atan(v[8]), math.atan(v[9]), math.atan(v[10]), math.atan(v[11]), 1778 + math.atan(v[12]), math.atan(v[13]), math.atan(v[14]), math.atan(v[15]), 1779 + ); 1780 + // zig fmt: on 1781 + try expect(approxEqAbs(e, atan(v), epsilon)); 1782 + } 1783 + { 1784 + try expect(approxEqAbs(atan(splat(F32x4, math.inf(f32))), splat(F32x4, 0.5 * math.pi), epsilon)); 1785 + try expect(approxEqAbs(atan(splat(F32x4, -math.inf(f32))), splat(F32x4, -0.5 * math.pi), epsilon)); 1786 + try expect(all(isNan(atan(splat(F32x4, math.nan(f32)))), 0) == true); 1787 + try expect(all(isNan(atan(splat(F32x4, -math.nan(f32)))), 0) == true); 1788 + } 1789 + } 1790 + 1791 + pub fn atan2(vy: anytype, vx: anytype) @TypeOf(vx, vy) { 1792 + const T = @TypeOf(vx, vy); 1793 + const Tu = @Vector(veclen(T), u32); 1794 + 1795 + const vx_is_positive = 1796 + (@as(Tu, @bitCast(vx)) & @as(Tu, @splat(0x8000_0000))) == @as(Tu, @splat(0)); 1797 + 1798 + const vy_sign = andInt(vy, splatNegativeZero(T)); 1799 + const c0_25pi = orInt(vy_sign, @as(T, @splat(0.25 * math.pi))); 1800 + const c0_50pi = orInt(vy_sign, @as(T, @splat(0.50 * math.pi))); 1801 + const c0_75pi = orInt(vy_sign, @as(T, @splat(0.75 * math.pi))); 1802 + const c1_00pi = orInt(vy_sign, @as(T, @splat(1.00 * math.pi))); 1803 + 1804 + var r1 = select(vx_is_positive, vy_sign, c1_00pi); 1805 + var r2 = select(vx == splat(T, 0.0), c0_50pi, splatInt(T, 0xffff_ffff)); 1806 + const r3 = select(vy == splat(T, 0.0), r1, r2); 1807 + const r4 = select(vx_is_positive, c0_25pi, c0_75pi); 1808 + const r5 = select(isInf(vx), r4, c0_50pi); 1809 + const result = select(isInf(vy), r5, r3); 1810 + const result_valid = @as(Tu, @bitCast(result)) == @as(Tu, @splat(0xffff_ffff)); 1811 + 1812 + const v = vy / vx; 1813 + const r0 = atan(v); 1814 + 1815 + r1 = select(vx_is_positive, splatNegativeZero(T), c1_00pi); 1816 + r2 = r0 + r1; 1817 + 1818 + return select(result_valid, r2, result); 1819 + } 1820 + test "zmath.atan2" { 1821 + // From DirectXMath XMVectorATan2(): 1822 + // 1823 + // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: 1824 + 1825 + // Y == 0 and X is Negative -> Pi with the sign of Y 1826 + // y == 0 and x is positive -> 0 with the sign of y 1827 + // Y != 0 and X == 0 -> Pi / 2 with the sign of Y 1828 + // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) 1829 + // X == -Infinity and Finite Y -> Pi with the sign of Y 1830 + // X == +Infinity and Finite Y -> 0 with the sign of Y 1831 + // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y 1832 + // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y 1833 + // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y 1834 + 1835 + const epsilon = 0.0001; 1836 + try expect(approxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, -1.0)), splat(F32x4, math.pi), epsilon)); 1837 + try expect(approxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, -1.0)), splat(F32x4, -math.pi), epsilon)); 1838 + try expect(approxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, 0.0)), splat(F32x4, 0.5 * math.pi), epsilon)); 1839 + try expect(approxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, 0.0)), splat(F32x4, -0.5 * math.pi), epsilon)); 1840 + try expect(approxEqAbs( 1841 + atan2(splat(F32x4, 1.0), splat(F32x4, -1.0)), 1842 + splat(F32x4, math.atan(@as(f32, -1.0)) + math.pi), 1843 + epsilon, 1844 + )); 1845 + try expect(approxEqAbs( 1846 + atan2(splat(F32x4, -10.0), splat(F32x4, -2.0)), 1847 + splat(F32x4, math.atan(@as(f32, 5.0)) - math.pi), 1848 + epsilon, 1849 + )); 1850 + try expect(approxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, math.pi), epsilon)); 1851 + try expect(approxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, -math.inf(f32))), splat(F32x4, -math.pi), epsilon)); 1852 + try expect(approxEqAbs(atan2(splat(F32x4, 1.0), splat(F32x4, math.inf(f32))), splat(F32x4, 0.0), epsilon)); 1853 + try expect(approxEqAbs(atan2(splat(F32x4, -1.0), splat(F32x4, math.inf(f32))), splat(F32x4, -0.0), epsilon)); 1854 + try expect(approxEqAbs( 1855 + atan2(splat(F32x4, math.inf(f32)), splat(F32x4, 2.0)), 1856 + splat(F32x4, 0.5 * math.pi), 1857 + epsilon, 1858 + )); 1859 + try expect(approxEqAbs( 1860 + atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, 2.0)), 1861 + splat(F32x4, -0.5 * math.pi), 1862 + epsilon, 1863 + )); 1864 + try expect(approxEqAbs( 1865 + atan2(splat(F32x4, math.inf(f32)), splat(F32x4, -math.inf(f32))), 1866 + splat(F32x4, 0.75 * math.pi), 1867 + epsilon, 1868 + )); 1869 + try expect(approxEqAbs( 1870 + atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, -math.inf(f32))), 1871 + splat(F32x4, -0.75 * math.pi), 1872 + epsilon, 1873 + )); 1874 + try expect(approxEqAbs( 1875 + atan2(splat(F32x4, math.inf(f32)), splat(F32x4, math.inf(f32))), 1876 + splat(F32x4, 0.25 * math.pi), 1877 + epsilon, 1878 + )); 1879 + try expect(approxEqAbs( 1880 + atan2(splat(F32x4, -math.inf(f32)), splat(F32x4, math.inf(f32))), 1881 + splat(F32x4, -0.25 * math.pi), 1882 + epsilon, 1883 + )); 1884 + try expect(approxEqAbs( 1885 + atan2( 1886 + f32x8(0.0, -math.inf(f32), -0.0, 2.0, math.inf(f32), math.inf(f32), 1.0, -math.inf(f32)), 1887 + f32x8(-2.0, math.inf(f32), 1.0, 0.0, 10.0, -math.inf(f32), 1.0, -math.inf(f32)), 1888 + ), 1889 + f32x8( 1890 + math.pi, 1891 + -0.25 * math.pi, 1892 + -0.0, 1893 + 0.5 * math.pi, 1894 + 0.5 * math.pi, 1895 + 0.75 * math.pi, 1896 + math.atan(@as(f32, 1.0)), 1897 + -0.75 * math.pi, 1898 + ), 1899 + epsilon, 1900 + )); 1901 + try expect(approxEqAbs(atan2(splat(F32x4, 0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon)); 1902 + try expect(approxEqAbs(atan2(splat(F32x4, -0.0), splat(F32x4, 0.0)), splat(F32x4, 0.0), epsilon)); 1903 + try expect(all(isNan(atan2(splat(F32x4, 1.0), splat(F32x4, math.nan(f32)))), 0) == true); 1904 + try expect(all(isNan(atan2(splat(F32x4, -1.0), splat(F32x4, math.nan(f32)))), 0) == true); 1905 + try expect(all(isNan(atan2(splat(F32x4, math.nan(f32)), splat(F32x4, -1.0))), 0) == true); 1906 + try expect(all(isNan(atan2(splat(F32x4, -math.nan(f32)), splat(F32x4, 1.0))), 0) == true); 1907 + } 1908 + // ------------------------------------------------------------------------------ 1909 + // 1910 + // 3. 2D, 3D, 4D vector functions 1911 + // 1912 + // ------------------------------------------------------------------------------ 1913 + pub inline fn dot2(v0: Vec, v1: Vec) F32x4 { 1914 + var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | -- | -- | 1915 + const xmm1 = swizzle(xmm0, .y, .x, .x, .x); // | y0*y1 | -- | -- | -- | 1916 + xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[3]); // | x0*x1 + y0*y1 | -- | -- | -- | 1917 + return swizzle(xmm0, .x, .x, .x, .x); 1918 + } 1919 + test "zmath.dot2" { 1920 + const v0 = f32x4(-1.0, 2.0, 300.0, -2.0); 1921 + const v1 = f32x4(4.0, 5.0, 600.0, 2.0); 1922 + const v = dot2(v0, v1); 1923 + try expect(approxEqAbs(v, splat(F32x4, 6.0), 0.0001)); 1924 + } 1925 + 1926 + pub inline fn dot3(v0: Vec, v1: Vec) F32x4 { 1927 + const dot = v0 * v1; 1928 + return f32x4s(dot[0] + dot[1] + dot[2]); 1929 + } 1930 + test "zmath.dot3" { 1931 + const v0 = f32x4(-1.0, 2.0, 3.0, 1.0); 1932 + const v1 = f32x4(4.0, 5.0, 6.0, 1.0); 1933 + const v = dot3(v0, v1); 1934 + try expect(approxEqAbs(v, splat(F32x4, 24.0), 0.0001)); 1935 + } 1936 + 1937 + pub inline fn dot4(v0: Vec, v1: Vec) F32x4 { 1938 + var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | z0*z1 | w0*w1 | 1939 + var xmm1 = swizzle(xmm0, .y, .x, .w, .x); // | y0*y1 | -- | w0*w1 | -- | 1940 + xmm1 = xmm0 + xmm1; // | x0*x1 + y0*y1 | -- | z0*z1 + w0*w1 | -- | 1941 + xmm0 = swizzle(xmm1, .z, .x, .x, .x); // | z0*z1 + w0*w1 | -- | -- | -- | 1942 + xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[2]); // addss 1943 + return swizzle(xmm0, .x, .x, .x, .x); 1944 + } 1945 + test "zmath.dot4" { 1946 + const v0 = f32x4(-1.0, 2.0, 3.0, -2.0); 1947 + const v1 = f32x4(4.0, 5.0, 6.0, 2.0); 1948 + const v = dot4(v0, v1); 1949 + try expect(approxEqAbs(v, splat(F32x4, 20.0), 0.0001)); 1950 + } 1951 + 1952 + pub inline fn cross3(v0: Vec, v1: Vec) Vec { 1953 + var xmm0 = swizzle(v0, .y, .z, .x, .w); 1954 + var xmm1 = swizzle(v1, .z, .x, .y, .w); 1955 + var result = xmm0 * xmm1; 1956 + xmm0 = swizzle(xmm0, .y, .z, .x, .w); 1957 + xmm1 = swizzle(xmm1, .z, .x, .y, .w); 1958 + result = result - xmm0 * xmm1; 1959 + return andInt(result, f32x4_mask3); 1960 + } 1961 + test "zmath.cross3" { 1962 + { 1963 + const v0 = f32x4(1.0, 0.0, 0.0, 1.0); 1964 + const v1 = f32x4(0.0, 1.0, 0.0, 1.0); 1965 + const v = cross3(v0, v1); 1966 + try expect(approxEqAbs(v, f32x4(0.0, 0.0, 1.0, 0.0), 0.0001)); 1967 + } 1968 + { 1969 + const v0 = f32x4(1.0, 0.0, 0.0, 1.0); 1970 + const v1 = f32x4(0.0, -1.0, 0.0, 1.0); 1971 + const v = cross3(v0, v1); 1972 + try expect(approxEqAbs(v, f32x4(0.0, 0.0, -1.0, 0.0), 0.0001)); 1973 + } 1974 + { 1975 + const v0 = f32x4(-3.0, 0, -2.0, 1.0); 1976 + const v1 = f32x4(5.0, -1.0, 2.0, 1.0); 1977 + const v = cross3(v0, v1); 1978 + try expect(approxEqAbs(v, f32x4(-2.0, -4.0, 3.0, 0.0), 0.0001)); 1979 + } 1980 + } 1981 + 1982 + pub inline fn lengthSq2(v: Vec) F32x4 { 1983 + return dot2(v, v); 1984 + } 1985 + pub inline fn lengthSq3(v: Vec) F32x4 { 1986 + return dot3(v, v); 1987 + } 1988 + pub inline fn lengthSq4(v: Vec) F32x4 { 1989 + return dot4(v, v); 1990 + } 1991 + 1992 + pub inline fn length2(v: Vec) F32x4 { 1993 + return sqrt(dot2(v, v)); 1994 + } 1995 + pub inline fn length3(v: Vec) F32x4 { 1996 + return sqrt(dot3(v, v)); 1997 + } 1998 + pub inline fn length4(v: Vec) F32x4 { 1999 + return sqrt(dot4(v, v)); 2000 + } 2001 + test "zmath.length3" { 2002 + { 2003 + const v = length3(f32x4(1.0, -2.0, 3.0, 1000.0)); 2004 + try expect(approxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001)); 2005 + } 2006 + { 2007 + const v = length3(f32x4(1.0, math.nan(f32), math.nan(f32), 1000.0)); 2008 + try expect(all(isNan(v), 0)); 2009 + } 2010 + { 2011 + const v = length3(f32x4(1.0, math.inf(f32), 3.0, 1000.0)); 2012 + try expect(all(isInf(v), 0)); 2013 + } 2014 + { 2015 + const v = length3(f32x4(3.0, 2.0, 1.0, math.nan(f32))); 2016 + try expect(approxEqAbs(v, splat(F32x4, math.sqrt(14.0)), 0.001)); 2017 + } 2018 + } 2019 + 2020 + pub inline fn normalize2(v: Vec) Vec { 2021 + return v * splat(F32x4, 1.0) / sqrt(dot2(v, v)); 2022 + } 2023 + pub inline fn normalize3(v: Vec) Vec { 2024 + return v * splat(F32x4, 1.0) / sqrt(dot3(v, v)); 2025 + } 2026 + pub inline fn normalize4(v: Vec) Vec { 2027 + return v * splat(F32x4, 1.0) / sqrt(dot4(v, v)); 2028 + } 2029 + test "zmath.normalize3" { 2030 + { 2031 + const v0 = f32x4(1.0, -2.0, 3.0, 1000.0); 2032 + const v = normalize3(v0); 2033 + try expect(approxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(14.0)), 0.0005)); 2034 + } 2035 + { 2036 + try expect(any(isNan(normalize3(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0)); 2037 + try expect(any(isNan(normalize3(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0)); 2038 + try expect(any(isNan(normalize3(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0)); 2039 + try expect(any(isNan(normalize3(f32x4(0, 0, 0, 0))), 0)); 2040 + } 2041 + } 2042 + test "zmath.normalize4" { 2043 + { 2044 + const v0 = f32x4(1.0, -2.0, 3.0, 10.0); 2045 + const v = normalize4(v0); 2046 + try expect(approxEqAbs(v, v0 * splat(F32x4, 1.0 / math.sqrt(114.0)), 0.0005)); 2047 + } 2048 + { 2049 + try expect(any(isNan(normalize4(f32x4(1.0, math.inf(f32), 1.0, 1.0))), 0)); 2050 + try expect(any(isNan(normalize4(f32x4(-math.inf(f32), math.inf(f32), 0.0, 0.0))), 0)); 2051 + try expect(any(isNan(normalize4(f32x4(-math.nan(f32), math.snan(f32), 0.0, 0.0))), 0)); 2052 + try expect(any(isNan(normalize4(f32x4(0, 0, 0, 0))), 0)); 2053 + } 2054 + } 2055 + 2056 + fn vecMulMat(v: Vec, m: Mat) Vec { 2057 + const vx = @shuffle(f32, v, undefined, [4]i32{ 0, 0, 0, 0 }); 2058 + const vy = @shuffle(f32, v, undefined, [4]i32{ 1, 1, 1, 1 }); 2059 + const vz = @shuffle(f32, v, undefined, [4]i32{ 2, 2, 2, 2 }); 2060 + const vw = @shuffle(f32, v, undefined, [4]i32{ 3, 3, 3, 3 }); 2061 + return vx * m[0] + vy * m[1] + vz * m[2] + vw * m[3]; 2062 + } 2063 + fn matMulVec(m: Mat, v: Vec) Vec { 2064 + return .{ dot4(m[0], v)[0], dot4(m[1], v)[0], dot4(m[2], v)[0], dot4(m[3], v)[0] }; 2065 + } 2066 + test "zmath.vecMulMat" { 2067 + const m = Mat{ 2068 + f32x4(1.0, 0.0, 0.0, 0.0), 2069 + f32x4(0.0, 1.0, 0.0, 0.0), 2070 + f32x4(0.0, 0.0, 1.0, 0.0), 2071 + f32x4(2.0, 3.0, 4.0, 1.0), 2072 + }; 2073 + const vm = mul(f32x4(1.0, 2.0, 3.0, 1.0), m); 2074 + const mv = mul(m, f32x4(1.0, 2.0, 3.0, 1.0)); 2075 + const v = mul(transpose(m), f32x4(1.0, 2.0, 3.0, 1.0)); 2076 + try expect(approxEqAbs(vm, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001)); 2077 + try expect(approxEqAbs(mv, f32x4(1.0, 2.0, 3.0, 21.0), 0.0001)); 2078 + try expect(approxEqAbs(v, f32x4(3.0, 5.0, 7.0, 1.0), 0.0001)); 2079 + } 2080 + // ------------------------------------------------------------------------------ 2081 + // 2082 + // 4. Matrix functions 2083 + // 2084 + // ------------------------------------------------------------------------------ 2085 + pub fn identity() Mat { 2086 + const static = struct { 2087 + const identity = Mat{ 2088 + f32x4(1.0, 0.0, 0.0, 0.0), 2089 + f32x4(0.0, 1.0, 0.0, 0.0), 2090 + f32x4(0.0, 0.0, 1.0, 0.0), 2091 + f32x4(0.0, 0.0, 0.0, 1.0), 2092 + }; 2093 + }; 2094 + return static.identity; 2095 + } 2096 + 2097 + pub fn matFromArr(arr: [16]f32) Mat { 2098 + return Mat{ 2099 + f32x4(arr[0], arr[1], arr[2], arr[3]), 2100 + f32x4(arr[4], arr[5], arr[6], arr[7]), 2101 + f32x4(arr[8], arr[9], arr[10], arr[11]), 2102 + f32x4(arr[12], arr[13], arr[14], arr[15]), 2103 + }; 2104 + } 2105 + 2106 + fn mulRetType(comptime Ta: type, comptime Tb: type) type { 2107 + if (Ta == Mat and Tb == Mat) { 2108 + return Mat; 2109 + } else if ((Ta == f32 and Tb == Mat) or (Ta == Mat and Tb == f32)) { 2110 + return Mat; 2111 + } else if ((Ta == Vec and Tb == Mat) or (Ta == Mat and Tb == Vec)) { 2112 + return Vec; 2113 + } 2114 + @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ @typeName(Tb)); 2115 + } 2116 + 2117 + pub fn mul(a: anytype, b: anytype) mulRetType(@TypeOf(a), @TypeOf(b)) { 2118 + const Ta = @TypeOf(a); 2119 + const Tb = @TypeOf(b); 2120 + if (Ta == Mat and Tb == Mat) { 2121 + return mulMat(a, b); 2122 + } else if (Ta == f32 and Tb == Mat) { 2123 + const va = splat(F32x4, a); 2124 + return Mat{ va * b[0], va * b[1], va * b[2], va * b[3] }; 2125 + } else if (Ta == Mat and Tb == f32) { 2126 + const vb = splat(F32x4, b); 2127 + return Mat{ a[0] * vb, a[1] * vb, a[2] * vb, a[3] * vb }; 2128 + } else if (Ta == Vec and Tb == Mat) { 2129 + return vecMulMat(a, b); 2130 + } else if (Ta == Mat and Tb == Vec) { 2131 + return matMulVec(a, b); 2132 + } else { 2133 + @compileError("zmath.mul() not implemented for types: " ++ @typeName(Ta) ++ ", " ++ @typeName(Tb)); 2134 + } 2135 + } 2136 + test "zmath.mul" { 2137 + { 2138 + const m = Mat{ 2139 + f32x4(0.1, 0.2, 0.3, 0.4), 2140 + f32x4(0.5, 0.6, 0.7, 0.8), 2141 + f32x4(0.9, 1.0, 1.1, 1.2), 2142 + f32x4(1.3, 1.4, 1.5, 1.6), 2143 + }; 2144 + const ms = mul(@as(f32, 2.0), m); 2145 + try expect(approxEqAbs(ms[0], f32x4(0.2, 0.4, 0.6, 0.8), 0.0001)); 2146 + try expect(approxEqAbs(ms[1], f32x4(1.0, 1.2, 1.4, 1.6), 0.0001)); 2147 + try expect(approxEqAbs(ms[2], f32x4(1.8, 2.0, 2.2, 2.4), 0.0001)); 2148 + try expect(approxEqAbs(ms[3], f32x4(2.6, 2.8, 3.0, 3.2), 0.0001)); 2149 + } 2150 + } 2151 + 2152 + fn mulMat(m0: Mat, m1: Mat) Mat { 2153 + var result: Mat = undefined; 2154 + comptime var row: u32 = 0; 2155 + inline while (row < 4) : (row += 1) { 2156 + const vx = swizzle(m0[row], .x, .x, .x, .x); 2157 + const vy = swizzle(m0[row], .y, .y, .y, .y); 2158 + const vz = swizzle(m0[row], .z, .z, .z, .z); 2159 + const vw = swizzle(m0[row], .w, .w, .w, .w); 2160 + result[row] = mulAdd(vx, m1[0], vz * m1[2]) + mulAdd(vy, m1[1], vw * m1[3]); 2161 + } 2162 + return result; 2163 + } 2164 + test "zmath.matrix.mul" { 2165 + const a = Mat{ 2166 + f32x4(0.1, 0.2, 0.3, 0.4), 2167 + f32x4(0.5, 0.6, 0.7, 0.8), 2168 + f32x4(0.9, 1.0, 1.1, 1.2), 2169 + f32x4(1.3, 1.4, 1.5, 1.6), 2170 + }; 2171 + const b = Mat{ 2172 + f32x4(1.7, 1.8, 1.9, 2.0), 2173 + f32x4(2.1, 2.2, 2.3, 2.4), 2174 + f32x4(2.5, 2.6, 2.7, 2.8), 2175 + f32x4(2.9, 3.0, 3.1, 3.2), 2176 + }; 2177 + const c = mul(a, b); 2178 + try expect(approxEqAbs(c[0], f32x4(2.5, 2.6, 2.7, 2.8), 0.0001)); 2179 + try expect(approxEqAbs(c[1], f32x4(6.18, 6.44, 6.7, 6.96), 0.0001)); 2180 + try expect(approxEqAbs(c[2], f32x4(9.86, 10.28, 10.7, 11.12), 0.0001)); 2181 + try expect(approxEqAbs(c[3], f32x4(13.54, 14.12, 14.7, 15.28), 0.0001)); 2182 + } 2183 + 2184 + pub fn transpose(m: Mat) Mat { 2185 + const temp1 = @shuffle(f32, m[0], m[1], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) }); 2186 + const temp3 = @shuffle(f32, m[0], m[1], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); 2187 + const temp2 = @shuffle(f32, m[2], m[3], [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 1) }); 2188 + const temp4 = @shuffle(f32, m[2], m[3], [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); 2189 + return .{ 2190 + @shuffle(f32, temp1, temp2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }), 2191 + @shuffle(f32, temp1, temp2, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }), 2192 + @shuffle(f32, temp3, temp4, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }), 2193 + @shuffle(f32, temp3, temp4, [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }), 2194 + }; 2195 + } 2196 + test "zmath.matrix.transpose" { 2197 + const m = Mat{ 2198 + f32x4(1.0, 2.0, 3.0, 4.0), 2199 + f32x4(5.0, 6.0, 7.0, 8.0), 2200 + f32x4(9.0, 10.0, 11.0, 12.0), 2201 + f32x4(13.0, 14.0, 15.0, 16.0), 2202 + }; 2203 + const mt = transpose(m); 2204 + try expect(approxEqAbs(mt[0], f32x4(1.0, 5.0, 9.0, 13.0), 0.0001)); 2205 + try expect(approxEqAbs(mt[1], f32x4(2.0, 6.0, 10.0, 14.0), 0.0001)); 2206 + try expect(approxEqAbs(mt[2], f32x4(3.0, 7.0, 11.0, 15.0), 0.0001)); 2207 + try expect(approxEqAbs(mt[3], f32x4(4.0, 8.0, 12.0, 16.0), 0.0001)); 2208 + } 2209 + 2210 + pub fn rotationX(angle: f32) Mat { 2211 + const sc = sincos(angle); 2212 + return .{ 2213 + f32x4(1.0, 0.0, 0.0, 0.0), 2214 + f32x4(0.0, sc[1], sc[0], 0.0), 2215 + f32x4(0.0, -sc[0], sc[1], 0.0), 2216 + f32x4(0.0, 0.0, 0.0, 1.0), 2217 + }; 2218 + } 2219 + 2220 + pub fn rotationY(angle: f32) Mat { 2221 + const sc = sincos(angle); 2222 + return .{ 2223 + f32x4(sc[1], 0.0, -sc[0], 0.0), 2224 + f32x4(0.0, 1.0, 0.0, 0.0), 2225 + f32x4(sc[0], 0.0, sc[1], 0.0), 2226 + f32x4(0.0, 0.0, 0.0, 1.0), 2227 + }; 2228 + } 2229 + 2230 + pub fn rotationZ(angle: f32) Mat { 2231 + const sc = sincos(angle); 2232 + return .{ 2233 + f32x4(sc[1], sc[0], 0.0, 0.0), 2234 + f32x4(-sc[0], sc[1], 0.0, 0.0), 2235 + f32x4(0.0, 0.0, 1.0, 0.0), 2236 + f32x4(0.0, 0.0, 0.0, 1.0), 2237 + }; 2238 + } 2239 + 2240 + pub fn translation(x: f32, y: f32, z: f32) Mat { 2241 + return .{ 2242 + f32x4(1.0, 0.0, 0.0, 0.0), 2243 + f32x4(0.0, 1.0, 0.0, 0.0), 2244 + f32x4(0.0, 0.0, 1.0, 0.0), 2245 + f32x4(x, y, z, 1.0), 2246 + }; 2247 + } 2248 + pub fn translationV(v: Vec) Mat { 2249 + return translation(v[0], v[1], v[2]); 2250 + } 2251 + 2252 + pub fn scaling(x: f32, y: f32, z: f32) Mat { 2253 + return .{ 2254 + f32x4(x, 0.0, 0.0, 0.0), 2255 + f32x4(0.0, y, 0.0, 0.0), 2256 + f32x4(0.0, 0.0, z, 0.0), 2257 + f32x4(0.0, 0.0, 0.0, 1.0), 2258 + }; 2259 + } 2260 + pub fn scalingV(v: Vec) Mat { 2261 + return scaling(v[0], v[1], v[2]); 2262 + } 2263 + 2264 + pub fn lookToLh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat { 2265 + const az = normalize3(eyedir); 2266 + const ax = normalize3(cross3(updir, az)); 2267 + const ay = normalize3(cross3(az, ax)); 2268 + return transpose(.{ 2269 + f32x4(ax[0], ax[1], ax[2], -dot3(ax, eyepos)[0]), 2270 + f32x4(ay[0], ay[1], ay[2], -dot3(ay, eyepos)[0]), 2271 + f32x4(az[0], az[1], az[2], -dot3(az, eyepos)[0]), 2272 + f32x4(0.0, 0.0, 0.0, 1.0), 2273 + }); 2274 + } 2275 + pub fn lookToRh(eyepos: Vec, eyedir: Vec, updir: Vec) Mat { 2276 + return lookToLh(eyepos, -eyedir, updir); 2277 + } 2278 + pub fn lookAtLh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat { 2279 + return lookToLh(eyepos, focuspos - eyepos, updir); 2280 + } 2281 + pub fn lookAtRh(eyepos: Vec, focuspos: Vec, updir: Vec) Mat { 2282 + return lookToLh(eyepos, eyepos - focuspos, updir); 2283 + } 2284 + test "zmath.matrix.lookToLh" { 2285 + const m = lookToLh(f32x4(0.0, 0.0, -3.0, 1.0), f32x4(0.0, 0.0, 1.0, 0.0), f32x4(0.0, 1.0, 0.0, 0.0)); 2286 + try expect(approxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.001)); 2287 + try expect(approxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.001)); 2288 + try expect(approxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.001)); 2289 + try expect(approxEqAbs(m[3], f32x4(0.0, 0.0, 3.0, 1.0), 0.001)); 2290 + } 2291 + 2292 + pub fn perspectiveFovLh(fovy: f32, aspect: f32, near: f32, far: f32) Mat { 2293 + const scfov = sincos(0.5 * fovy); 2294 + 2295 + assert(near > 0.0 and far > 0.0); 2296 + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); 2297 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2298 + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); 2299 + 2300 + const h = scfov[1] / scfov[0]; 2301 + const w = h / aspect; 2302 + const r = far / (far - near); 2303 + return .{ 2304 + f32x4(w, 0.0, 0.0, 0.0), 2305 + f32x4(0.0, h, 0.0, 0.0), 2306 + f32x4(0.0, 0.0, r, 1.0), 2307 + f32x4(0.0, 0.0, -r * near, 0.0), 2308 + }; 2309 + } 2310 + pub fn perspectiveFovRh(fovy: f32, aspect: f32, near: f32, far: f32) Mat { 2311 + const scfov = sincos(0.5 * fovy); 2312 + 2313 + assert(near > 0.0 and far > 0.0); 2314 + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); 2315 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2316 + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); 2317 + 2318 + const h = scfov[1] / scfov[0]; 2319 + const w = h / aspect; 2320 + const r = far / (near - far); 2321 + return .{ 2322 + f32x4(w, 0.0, 0.0, 0.0), 2323 + f32x4(0.0, h, 0.0, 0.0), 2324 + f32x4(0.0, 0.0, r, -1.0), 2325 + f32x4(0.0, 0.0, r * near, 0.0), 2326 + }; 2327 + } 2328 + 2329 + // Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2330 + pub fn perspectiveFovLhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat { 2331 + const scfov = sincos(0.5 * fovy); 2332 + 2333 + assert(near > 0.0 and far > 0.0); 2334 + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); 2335 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2336 + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); 2337 + 2338 + const h = scfov[1] / scfov[0]; 2339 + const w = h / aspect; 2340 + const r = far - near; 2341 + return .{ 2342 + f32x4(w, 0.0, 0.0, 0.0), 2343 + f32x4(0.0, h, 0.0, 0.0), 2344 + f32x4(0.0, 0.0, (near + far) / r, 1.0), 2345 + f32x4(0.0, 0.0, 2.0 * near * far / -r, 0.0), 2346 + }; 2347 + } 2348 + 2349 + // Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2350 + pub fn perspectiveFovRhGl(fovy: f32, aspect: f32, near: f32, far: f32) Mat { 2351 + const scfov = sincos(0.5 * fovy); 2352 + 2353 + assert(near > 0.0 and far > 0.0); 2354 + assert(!math.approxEqAbs(f32, scfov[0], 0.0, 0.001)); 2355 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2356 + assert(!math.approxEqAbs(f32, aspect, 0.0, 0.01)); 2357 + 2358 + const h = scfov[1] / scfov[0]; 2359 + const w = h / aspect; 2360 + const r = near - far; 2361 + return .{ 2362 + f32x4(w, 0.0, 0.0, 0.0), 2363 + f32x4(0.0, h, 0.0, 0.0), 2364 + f32x4(0.0, 0.0, (near + far) / r, -1.0), 2365 + f32x4(0.0, 0.0, 2.0 * near * far / r, 0.0), 2366 + }; 2367 + } 2368 + 2369 + pub fn orthographicLh(w: f32, h: f32, near: f32, far: f32) Mat { 2370 + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); 2371 + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); 2372 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2373 + 2374 + const r = 1 / (far - near); 2375 + return .{ 2376 + f32x4(2 / w, 0.0, 0.0, 0.0), 2377 + f32x4(0.0, 2 / h, 0.0, 0.0), 2378 + f32x4(0.0, 0.0, r, 0.0), 2379 + f32x4(0.0, 0.0, -r * near, 1.0), 2380 + }; 2381 + } 2382 + 2383 + pub fn orthographicRh(w: f32, h: f32, near: f32, far: f32) Mat { 2384 + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); 2385 + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); 2386 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2387 + 2388 + const r = 1 / (near - far); 2389 + return .{ 2390 + f32x4(2 / w, 0.0, 0.0, 0.0), 2391 + f32x4(0.0, 2 / h, 0.0, 0.0), 2392 + f32x4(0.0, 0.0, r, 0.0), 2393 + f32x4(0.0, 0.0, r * near, 1.0), 2394 + }; 2395 + } 2396 + 2397 + // Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2398 + pub fn orthographicLhGl(w: f32, h: f32, near: f32, far: f32) Mat { 2399 + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); 2400 + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); 2401 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2402 + 2403 + const r = far - near; 2404 + return .{ 2405 + f32x4(2 / w, 0.0, 0.0, 0.0), 2406 + f32x4(0.0, 2 / h, 0.0, 0.0), 2407 + f32x4(0.0, 0.0, 2 / r, 0.0), 2408 + f32x4(0.0, 0.0, (near + far) / -r, 1.0), 2409 + }; 2410 + } 2411 + 2412 + // Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2413 + pub fn orthographicRhGl(w: f32, h: f32, near: f32, far: f32) Mat { 2414 + assert(!math.approxEqAbs(f32, w, 0.0, 0.001)); 2415 + assert(!math.approxEqAbs(f32, h, 0.0, 0.001)); 2416 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2417 + 2418 + const r = near - far; 2419 + return .{ 2420 + f32x4(2 / w, 0.0, 0.0, 0.0), 2421 + f32x4(0.0, 2 / h, 0.0, 0.0), 2422 + f32x4(0.0, 0.0, 2 / r, 0.0), 2423 + f32x4(0.0, 0.0, (near + far) / r, 1.0), 2424 + }; 2425 + } 2426 + 2427 + pub fn orthographicOffCenterLh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { 2428 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2429 + 2430 + const r = 1 / (far - near); 2431 + return .{ 2432 + f32x4(2 / (right - left), 0.0, 0.0, 0.0), 2433 + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), 2434 + f32x4(0.0, 0.0, r, 0.0), 2435 + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), -r * near, 1.0), 2436 + }; 2437 + } 2438 + 2439 + pub fn orthographicOffCenterRh(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { 2440 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2441 + 2442 + const r = 1 / (near - far); 2443 + return .{ 2444 + f32x4(2 / (right - left), 0.0, 0.0, 0.0), 2445 + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), 2446 + f32x4(0.0, 0.0, r, 0.0), 2447 + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), r * near, 1.0), 2448 + }; 2449 + } 2450 + 2451 + // Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2452 + pub fn orthographicOffCenterLhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { 2453 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2454 + 2455 + const r = far - near; 2456 + return .{ 2457 + f32x4(2 / (right - left), 0.0, 0.0, 0.0), 2458 + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), 2459 + f32x4(0.0, 0.0, 2 / r, 0.0), 2460 + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / -r, 1.0), 2461 + }; 2462 + } 2463 + 2464 + // Produces Z values in [-1.0, 1.0] range (OpenGL defaults) 2465 + pub fn orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, near: f32, far: f32) Mat { 2466 + assert(!math.approxEqAbs(f32, far, near, 0.001)); 2467 + 2468 + const r = near - far; 2469 + return .{ 2470 + f32x4(2 / (right - left), 0.0, 0.0, 0.0), 2471 + f32x4(0.0, 2 / (top - bottom), 0.0, 0.0), 2472 + f32x4(0.0, 0.0, 2 / r, 0.0), 2473 + f32x4(-(right + left) / (right - left), -(top + bottom) / (top - bottom), (near + far) / r, 1.0), 2474 + }; 2475 + } 2476 + 2477 + pub fn determinant(m: Mat) F32x4 { 2478 + var v0 = swizzle(m[2], .y, .x, .x, .x); 2479 + var v1 = swizzle(m[3], .z, .z, .y, .y); 2480 + var v2 = swizzle(m[2], .y, .x, .x, .x); 2481 + var v3 = swizzle(m[3], .w, .w, .w, .z); 2482 + var v4 = swizzle(m[2], .z, .z, .y, .y); 2483 + var v5 = swizzle(m[3], .w, .w, .w, .z); 2484 + 2485 + var p0 = v0 * v1; 2486 + var p1 = v2 * v3; 2487 + var p2 = v4 * v5; 2488 + 2489 + v0 = swizzle(m[2], .z, .z, .y, .y); 2490 + v1 = swizzle(m[3], .y, .x, .x, .x); 2491 + v2 = swizzle(m[2], .w, .w, .w, .z); 2492 + v3 = swizzle(m[3], .y, .x, .x, .x); 2493 + v4 = swizzle(m[2], .w, .w, .w, .z); 2494 + v5 = swizzle(m[3], .z, .z, .y, .y); 2495 + 2496 + p0 = mulAdd(-v0, v1, p0); 2497 + p1 = mulAdd(-v2, v3, p1); 2498 + p2 = mulAdd(-v4, v5, p2); 2499 + 2500 + v0 = swizzle(m[1], .w, .w, .w, .z); 2501 + v1 = swizzle(m[1], .z, .z, .y, .y); 2502 + v2 = swizzle(m[1], .y, .x, .x, .x); 2503 + 2504 + const s = m[0] * f32x4(1.0, -1.0, 1.0, -1.0); 2505 + var r = v0 * p0; 2506 + r = mulAdd(-v1, p1, r); 2507 + r = mulAdd(v2, p2, r); 2508 + return dot4(s, r); 2509 + } 2510 + test "zmath.matrix.determinant" { 2511 + const m = Mat{ 2512 + f32x4(10.0, -9.0, -12.0, 1.0), 2513 + f32x4(7.0, -12.0, 11.0, 1.0), 2514 + f32x4(-10.0, 10.0, 3.0, 1.0), 2515 + f32x4(1.0, 2.0, 3.0, 4.0), 2516 + }; 2517 + try expect(approxEqAbs(determinant(m), splat(F32x4, 2939.0), 0.0001)); 2518 + } 2519 + 2520 + pub fn inverse(a: anytype) @TypeOf(a) { 2521 + const T = @TypeOf(a); 2522 + return switch (T) { 2523 + Mat => inverseMat(a), 2524 + Quat => inverseQuat(a), 2525 + else => @compileError("zmath.inverse() not implemented for " ++ @typeName(T)), 2526 + }; 2527 + } 2528 + 2529 + fn inverseMat(m: Mat) Mat { 2530 + return inverseDet(m, null); 2531 + } 2532 + 2533 + pub fn inverseDet(m: Mat, out_det: ?*F32x4) Mat { 2534 + const mt = transpose(m); 2535 + var v0: [4]F32x4 = undefined; 2536 + var v1: [4]F32x4 = undefined; 2537 + 2538 + v0[0] = swizzle(mt[2], .x, .x, .y, .y); 2539 + v1[0] = swizzle(mt[3], .z, .w, .z, .w); 2540 + v0[1] = swizzle(mt[0], .x, .x, .y, .y); 2541 + v1[1] = swizzle(mt[1], .z, .w, .z, .w); 2542 + v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); 2543 + v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); 2544 + 2545 + var d0 = v0[0] * v1[0]; 2546 + var d1 = v0[1] * v1[1]; 2547 + var d2 = v0[2] * v1[2]; 2548 + 2549 + v0[0] = swizzle(mt[2], .z, .w, .z, .w); 2550 + v1[0] = swizzle(mt[3], .x, .x, .y, .y); 2551 + v0[1] = swizzle(mt[0], .z, .w, .z, .w); 2552 + v1[1] = swizzle(mt[1], .x, .x, .y, .y); 2553 + v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); 2554 + v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); 2555 + 2556 + d0 = mulAdd(-v0[0], v1[0], d0); 2557 + d1 = mulAdd(-v0[1], v1[1], d1); 2558 + d2 = mulAdd(-v0[2], v1[2], d2); 2559 + 2560 + v0[0] = swizzle(mt[1], .y, .z, .x, .y); 2561 + v1[0] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 1, 3, 0 }); 2562 + v0[1] = swizzle(mt[0], .z, .x, .y, .x); 2563 + v1[1] = @shuffle(f32, d0, d2, [4]i32{ 3, ~@as(i32, 1), 1, 2 }); 2564 + v0[2] = swizzle(mt[3], .y, .z, .x, .y); 2565 + v1[2] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 1, 3, 0 }); 2566 + v0[3] = swizzle(mt[2], .z, .x, .y, .x); 2567 + v1[3] = @shuffle(f32, d1, d2, [4]i32{ 3, ~@as(i32, 3), 1, 2 }); 2568 + 2569 + var c0 = v0[0] * v1[0]; 2570 + var c2 = v0[1] * v1[1]; 2571 + var c4 = v0[2] * v1[2]; 2572 + var c6 = v0[3] * v1[3]; 2573 + 2574 + v0[0] = swizzle(mt[1], .z, .w, .y, .z); 2575 + v1[0] = @shuffle(f32, d0, d2, [4]i32{ 3, 0, 1, ~@as(i32, 0) }); 2576 + v0[1] = swizzle(mt[0], .w, .z, .w, .y); 2577 + v1[1] = @shuffle(f32, d0, d2, [4]i32{ 2, 1, ~@as(i32, 0), 0 }); 2578 + v0[2] = swizzle(mt[3], .z, .w, .y, .z); 2579 + v1[2] = @shuffle(f32, d1, d2, [4]i32{ 3, 0, 1, ~@as(i32, 2) }); 2580 + v0[3] = swizzle(mt[2], .w, .z, .w, .y); 2581 + v1[3] = @shuffle(f32, d1, d2, [4]i32{ 2, 1, ~@as(i32, 2), 0 }); 2582 + 2583 + c0 = mulAdd(-v0[0], v1[0], c0); 2584 + c2 = mulAdd(-v0[1], v1[1], c2); 2585 + c4 = mulAdd(-v0[2], v1[2], c4); 2586 + c6 = mulAdd(-v0[3], v1[3], c6); 2587 + 2588 + v0[0] = swizzle(mt[1], .w, .x, .w, .x); 2589 + v1[0] = @shuffle(f32, d0, d2, [4]i32{ 2, ~@as(i32, 1), ~@as(i32, 0), 2 }); 2590 + v0[1] = swizzle(mt[0], .y, .w, .x, .z); 2591 + v1[1] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 0, 3, ~@as(i32, 0) }); 2592 + v0[2] = swizzle(mt[3], .w, .x, .w, .x); 2593 + v1[2] = @shuffle(f32, d1, d2, [4]i32{ 2, ~@as(i32, 3), ~@as(i32, 2), 2 }); 2594 + v0[3] = swizzle(mt[2], .y, .w, .x, .z); 2595 + v1[3] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 0, 3, ~@as(i32, 2) }); 2596 + 2597 + const c1 = mulAdd(-v0[0], v1[0], c0); 2598 + const c3 = mulAdd(v0[1], v1[1], c2); 2599 + const c5 = mulAdd(-v0[2], v1[2], c4); 2600 + const c7 = mulAdd(v0[3], v1[3], c6); 2601 + 2602 + c0 = mulAdd(v0[0], v1[0], c0); 2603 + c2 = mulAdd(-v0[1], v1[1], c2); 2604 + c4 = mulAdd(v0[2], v1[2], c4); 2605 + c6 = mulAdd(-v0[3], v1[3], c6); 2606 + 2607 + var mr = Mat{ 2608 + f32x4(c0[0], c1[1], c0[2], c1[3]), 2609 + f32x4(c2[0], c3[1], c2[2], c3[3]), 2610 + f32x4(c4[0], c5[1], c4[2], c5[3]), 2611 + f32x4(c6[0], c7[1], c6[2], c7[3]), 2612 + }; 2613 + 2614 + const det = dot4(mr[0], mt[0]); 2615 + if (out_det != null) { 2616 + out_det.?.* = det; 2617 + } 2618 + 2619 + if (math.approxEqAbs(f32, det[0], 0.0, math.floatEps(f32))) { 2620 + return .{ 2621 + f32x4(0.0, 0.0, 0.0, 0.0), 2622 + f32x4(0.0, 0.0, 0.0, 0.0), 2623 + f32x4(0.0, 0.0, 0.0, 0.0), 2624 + f32x4(0.0, 0.0, 0.0, 0.0), 2625 + }; 2626 + } 2627 + 2628 + const scale = splat(F32x4, 1.0) / det; 2629 + mr[0] *= scale; 2630 + mr[1] *= scale; 2631 + mr[2] *= scale; 2632 + mr[3] *= scale; 2633 + return mr; 2634 + } 2635 + test "zmath.matrix.inverse" { 2636 + const m = Mat{ 2637 + f32x4(10.0, -9.0, -12.0, 1.0), 2638 + f32x4(7.0, -12.0, 11.0, 1.0), 2639 + f32x4(-10.0, 10.0, 3.0, 1.0), 2640 + f32x4(1.0, 2.0, 3.0, 4.0), 2641 + }; 2642 + var det: F32x4 = undefined; 2643 + const mi = inverseDet(m, &det); 2644 + try expect(approxEqAbs(det, splat(F32x4, 2939.0), 0.0001)); 2645 + 2646 + try expect(approxEqAbs(mi[0], f32x4(-0.170806, -0.13576, -0.349439, 0.164001), 0.0001)); 2647 + try expect(approxEqAbs(mi[1], f32x4(-0.163661, -0.14801, -0.253147, 0.141204), 0.0001)); 2648 + try expect(approxEqAbs(mi[2], f32x4(-0.0871045, 0.00646478, -0.0785982, 0.0398095), 0.0001)); 2649 + try expect(approxEqAbs(mi[3], f32x4(0.18986, 0.103096, 0.272882, 0.10854), 0.0001)); 2650 + } 2651 + 2652 + pub fn matFromNormAxisAngle(axis: Vec, angle: f32) Mat { 2653 + const sincos_angle = sincos(angle); 2654 + 2655 + const c2 = splat(F32x4, 1.0 - sincos_angle[1]); 2656 + const c1 = splat(F32x4, sincos_angle[1]); 2657 + const c0 = splat(F32x4, sincos_angle[0]); 2658 + 2659 + const n0 = swizzle(axis, .y, .z, .x, .w); 2660 + const n1 = swizzle(axis, .z, .x, .y, .w); 2661 + 2662 + var v0 = c2 * n0 * n1; 2663 + const r0 = c2 * axis * axis + c1; 2664 + const r1 = c0 * axis + v0; 2665 + var r2 = v0 - c0 * axis; 2666 + 2667 + v0 = andInt(r0, f32x4_mask3); 2668 + 2669 + var v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 2, ~@as(i32, 1), ~@as(i32, 2) }); 2670 + v1 = swizzle(v1, .y, .z, .w, .x); 2671 + 2672 + var v2 = @shuffle(f32, r1, r2, [4]i32{ 1, 1, ~@as(i32, 0), ~@as(i32, 0) }); 2673 + v2 = swizzle(v2, .x, .z, .x, .z); 2674 + 2675 + r2 = @shuffle(f32, v0, v1, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); 2676 + r2 = swizzle(r2, .x, .z, .w, .y); 2677 + 2678 + var m: Mat = undefined; 2679 + m[0] = r2; 2680 + 2681 + r2 = @shuffle(f32, v0, v1, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); 2682 + r2 = swizzle(r2, .z, .x, .w, .y); 2683 + m[1] = r2; 2684 + 2685 + v2 = @shuffle(f32, v2, v0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); 2686 + m[2] = v2; 2687 + m[3] = f32x4(0.0, 0.0, 0.0, 1.0); 2688 + return m; 2689 + } 2690 + pub fn matFromAxisAngle(axis: Vec, angle: f32) Mat { 2691 + assert(!all(axis == splat(F32x4, 0.0), 3)); 2692 + assert(!all(isInf(axis), 3)); 2693 + const normal = normalize3(axis); 2694 + return matFromNormAxisAngle(normal, angle); 2695 + } 2696 + test "zmath.matrix.matFromAxisAngle" { 2697 + { 2698 + const m0 = matFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), math.pi * 0.25); 2699 + const m1 = rotationX(math.pi * 0.25); 2700 + try expect(approxEqAbs(m0[0], m1[0], 0.001)); 2701 + try expect(approxEqAbs(m0[1], m1[1], 0.001)); 2702 + try expect(approxEqAbs(m0[2], m1[2], 0.001)); 2703 + try expect(approxEqAbs(m0[3], m1[3], 0.001)); 2704 + } 2705 + { 2706 + const m0 = matFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), math.pi * 0.125); 2707 + const m1 = rotationY(math.pi * 0.125); 2708 + try expect(approxEqAbs(m0[0], m1[0], 0.001)); 2709 + try expect(approxEqAbs(m0[1], m1[1], 0.001)); 2710 + try expect(approxEqAbs(m0[2], m1[2], 0.001)); 2711 + try expect(approxEqAbs(m0[3], m1[3], 0.001)); 2712 + } 2713 + { 2714 + const m0 = matFromAxisAngle(f32x4(0.0, 0.0, 1.0, 0.0), math.pi * 0.333); 2715 + const m1 = rotationZ(math.pi * 0.333); 2716 + try expect(approxEqAbs(m0[0], m1[0], 0.001)); 2717 + try expect(approxEqAbs(m0[1], m1[1], 0.001)); 2718 + try expect(approxEqAbs(m0[2], m1[2], 0.001)); 2719 + try expect(approxEqAbs(m0[3], m1[3], 0.001)); 2720 + } 2721 + } 2722 + 2723 + pub fn matFromQuat(quat: Quat) Mat { 2724 + const q0 = quat + quat; 2725 + var q1 = quat * q0; 2726 + 2727 + var v0 = swizzle(q1, .y, .x, .x, .w); 2728 + v0 = andInt(v0, f32x4_mask3); 2729 + 2730 + var v1 = swizzle(q1, .z, .z, .y, .w); 2731 + v1 = andInt(v1, f32x4_mask3); 2732 + 2733 + const r0 = (f32x4(1.0, 1.0, 1.0, 0.0) - v0) - v1; 2734 + 2735 + v0 = swizzle(quat, .x, .x, .y, .w); 2736 + v1 = swizzle(q0, .z, .y, .z, .w); 2737 + v0 = v0 * v1; 2738 + 2739 + v1 = swizzle(quat, .w, .w, .w, .w); 2740 + const v2 = swizzle(q0, .y, .z, .x, .w); 2741 + v1 = v1 * v2; 2742 + 2743 + const r1 = v0 + v1; 2744 + const r2 = v0 - v1; 2745 + 2746 + v0 = @shuffle(f32, r1, r2, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) }); 2747 + v0 = swizzle(v0, .x, .z, .w, .y); 2748 + v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 2), ~@as(i32, 2) }); 2749 + v1 = swizzle(v1, .x, .z, .x, .z); 2750 + 2751 + q1 = @shuffle(f32, r0, v0, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); 2752 + q1 = swizzle(q1, .x, .z, .w, .y); 2753 + 2754 + var m: Mat = undefined; 2755 + m[0] = q1; 2756 + 2757 + q1 = @shuffle(f32, r0, v0, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); 2758 + q1 = swizzle(q1, .z, .x, .w, .y); 2759 + m[1] = q1; 2760 + 2761 + q1 = @shuffle(f32, v1, r0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); 2762 + m[2] = q1; 2763 + m[3] = f32x4(0.0, 0.0, 0.0, 1.0); 2764 + return m; 2765 + } 2766 + test "zmath.matrix.matFromQuat" { 2767 + { 2768 + const m = matFromQuat(f32x4(0.0, 0.0, 0.0, 1.0)); 2769 + try expect(approxEqAbs(m[0], f32x4(1.0, 0.0, 0.0, 0.0), 0.0001)); 2770 + try expect(approxEqAbs(m[1], f32x4(0.0, 1.0, 0.0, 0.0), 0.0001)); 2771 + try expect(approxEqAbs(m[2], f32x4(0.0, 0.0, 1.0, 0.0), 0.0001)); 2772 + try expect(approxEqAbs(m[3], f32x4(0.0, 0.0, 0.0, 1.0), 0.0001)); 2773 + } 2774 + } 2775 + 2776 + pub fn matFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Mat { 2777 + return matFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0)); 2778 + } 2779 + pub fn matFromRollPitchYawV(angles: Vec) Mat { 2780 + return matFromQuat(quatFromRollPitchYawV(angles)); 2781 + } 2782 + 2783 + pub fn matToQuat(m: Mat) Quat { 2784 + return quatFromMat(m); 2785 + } 2786 + 2787 + pub inline fn loadMat(mem: []const f32) Mat { 2788 + return .{ 2789 + load(mem[0..4], F32x4, 0), 2790 + load(mem[4..8], F32x4, 0), 2791 + load(mem[8..12], F32x4, 0), 2792 + load(mem[12..16], F32x4, 0), 2793 + }; 2794 + } 2795 + test "zmath.loadMat" { 2796 + const a = [18]f32{ 2797 + 1.0, 2.0, 3.0, 4.0, 2798 + 5.0, 6.0, 7.0, 8.0, 2799 + 9.0, 10.0, 11.0, 12.0, 2800 + 13.0, 14.0, 15.0, 16.0, 2801 + 17.0, 18.0, 2802 + }; 2803 + const m = loadMat(a[1..]); 2804 + try expect(approxEqAbs(m[0], f32x4(2.0, 3.0, 4.0, 5.0), 0.0)); 2805 + try expect(approxEqAbs(m[1], f32x4(6.0, 7.0, 8.0, 9.0), 0.0)); 2806 + try expect(approxEqAbs(m[2], f32x4(10.0, 11.0, 12.0, 13.0), 0.0)); 2807 + try expect(approxEqAbs(m[3], f32x4(14.0, 15.0, 16.0, 17.0), 0.0)); 2808 + } 2809 + 2810 + pub inline fn storeMat(mem: []f32, m: Mat) void { 2811 + store(mem[0..4], m[0], 0); 2812 + store(mem[4..8], m[1], 0); 2813 + store(mem[8..12], m[2], 0); 2814 + store(mem[12..16], m[3], 0); 2815 + } 2816 + 2817 + pub inline fn loadMat43(mem: []const f32) Mat { 2818 + return .{ 2819 + f32x4(mem[0], mem[1], mem[2], 0.0), 2820 + f32x4(mem[3], mem[4], mem[5], 0.0), 2821 + f32x4(mem[6], mem[7], mem[8], 0.0), 2822 + f32x4(mem[9], mem[10], mem[11], 1.0), 2823 + }; 2824 + } 2825 + 2826 + pub inline fn storeMat43(mem: []f32, m: Mat) void { 2827 + store(mem[0..3], m[0], 3); 2828 + store(mem[3..6], m[1], 3); 2829 + store(mem[6..9], m[2], 3); 2830 + store(mem[9..12], m[3], 3); 2831 + } 2832 + 2833 + pub inline fn loadMat34(mem: []const f32) Mat { 2834 + return .{ 2835 + load(mem[0..4], F32x4, 0), 2836 + load(mem[4..8], F32x4, 0), 2837 + load(mem[8..12], F32x4, 0), 2838 + f32x4(0.0, 0.0, 0.0, 1.0), 2839 + }; 2840 + } 2841 + 2842 + pub inline fn storeMat34(mem: []f32, m: Mat) void { 2843 + store(mem[0..4], m[0], 0); 2844 + store(mem[4..8], m[1], 0); 2845 + store(mem[8..12], m[2], 0); 2846 + } 2847 + 2848 + pub inline fn matToArr(m: Mat) [16]f32 { 2849 + var array: [16]f32 = undefined; 2850 + storeMat(array[0..], m); 2851 + return array; 2852 + } 2853 + 2854 + pub inline fn matToArr43(m: Mat) [12]f32 { 2855 + var array: [12]f32 = undefined; 2856 + storeMat43(array[0..], m); 2857 + return array; 2858 + } 2859 + 2860 + pub inline fn matToArr34(m: Mat) [12]f32 { 2861 + var array: [12]f32 = undefined; 2862 + storeMat34(array[0..], m); 2863 + return array; 2864 + } 2865 + // ------------------------------------------------------------------------------ 2866 + // 2867 + // 5. Quaternion functions 2868 + // 2869 + // ------------------------------------------------------------------------------ 2870 + pub fn qmul(q0: Quat, q1: Quat) Quat { 2871 + var result = swizzle(q1, .w, .w, .w, .w); 2872 + var q1x = swizzle(q1, .x, .x, .x, .x); 2873 + var q1y = swizzle(q1, .y, .y, .y, .y); 2874 + var q1z = swizzle(q1, .z, .z, .z, .z); 2875 + result = result * q0; 2876 + var q0_shuf = swizzle(q0, .w, .z, .y, .x); 2877 + q1x = q1x * q0_shuf; 2878 + q0_shuf = swizzle(q0_shuf, .y, .x, .w, .z); 2879 + result = mulAdd(q1x, f32x4(1.0, -1.0, 1.0, -1.0), result); 2880 + q1y = q1y * q0_shuf; 2881 + q0_shuf = swizzle(q0_shuf, .w, .z, .y, .x); 2882 + q1y = q1y * f32x4(1.0, 1.0, -1.0, -1.0); 2883 + q1z = q1z * q0_shuf; 2884 + q1y = mulAdd(q1z, f32x4(-1.0, 1.0, 1.0, -1.0), q1y); 2885 + return result + q1y; 2886 + } 2887 + test "zmath.quaternion.mul" { 2888 + { 2889 + const q0 = f32x4(2.0, 3.0, 4.0, 1.0); 2890 + const q1 = f32x4(3.0, 2.0, 1.0, 4.0); 2891 + try expect(approxEqAbs(qmul(q0, q1), f32x4(16.0, 4.0, 22.0, -12.0), 0.0001)); 2892 + } 2893 + } 2894 + 2895 + pub fn quatToMat(quat: Quat) Mat { 2896 + return matFromQuat(quat); 2897 + } 2898 + 2899 + pub fn quatToAxisAngle(quat: Quat, axis: *Vec, angle: *f32) void { 2900 + axis.* = quat; 2901 + angle.* = 2.0 * acos(quat[3]); 2902 + } 2903 + test "zmath.quaternion.quatToAxisAngle" { 2904 + { 2905 + const q0 = quatFromNormAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); 2906 + var axis: Vec = f32x4(4.0, 3.0, 2.0, 1.0); 2907 + var angle: f32 = 10.0; 2908 + quatToAxisAngle(q0, &axis, &angle); 2909 + try expect(math.approxEqAbs(f32, axis[0], @sin(@as(f32, 0.25) * math.pi * 0.5), 0.0001)); 2910 + try expect(axis[1] == 0.0); 2911 + try expect(axis[2] == 0.0); 2912 + try expect(math.approxEqAbs(f32, angle, 0.25 * math.pi, 0.0001)); 2913 + } 2914 + } 2915 + 2916 + pub fn quatFromMat(m: Mat) Quat { 2917 + const r0 = m[0]; 2918 + const r1 = m[1]; 2919 + const r2 = m[2]; 2920 + const r00 = swizzle(r0, .x, .x, .x, .x); 2921 + const r11 = swizzle(r1, .y, .y, .y, .y); 2922 + const r22 = swizzle(r2, .z, .z, .z, .z); 2923 + 2924 + const x2gey2 = (r11 - r00) <= splat(F32x4, 0.0); 2925 + const z2gew2 = (r11 + r00) <= splat(F32x4, 0.0); 2926 + const x2py2gez2pw2 = r22 <= splat(F32x4, 0.0); 2927 + 2928 + var t0 = mulAdd(r00, f32x4(1.0, -1.0, -1.0, 1.0), splat(F32x4, 1.0)); 2929 + var t1 = r11 * f32x4(-1.0, 1.0, -1.0, 1.0); 2930 + var t2 = mulAdd(r22, f32x4(-1.0, -1.0, 1.0, 1.0), t0); 2931 + const x2y2z2w2 = t1 + t2; 2932 + 2933 + t0 = @shuffle(f32, r0, r1, [4]i32{ 1, 2, ~@as(i32, 2), ~@as(i32, 1) }); 2934 + t1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 0), ~@as(i32, 1) }); 2935 + t1 = swizzle(t1, .x, .z, .w, .y); 2936 + const xyxzyz = t0 + t1; 2937 + 2938 + t0 = @shuffle(f32, r2, r1, [4]i32{ 1, 0, ~@as(i32, 0), ~@as(i32, 0) }); 2939 + t1 = @shuffle(f32, r1, r0, [4]i32{ 2, 2, ~@as(i32, 2), ~@as(i32, 1) }); 2940 + t1 = swizzle(t1, .x, .z, .w, .y); 2941 + const xwywzw = (t0 - t1) * f32x4(-1.0, 1.0, -1.0, 1.0); 2942 + 2943 + t0 = @shuffle(f32, x2y2z2w2, xyxzyz, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 0) }); 2944 + t1 = @shuffle(f32, x2y2z2w2, xwywzw, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 0) }); 2945 + t2 = @shuffle(f32, xyxzyz, xwywzw, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) }); 2946 + 2947 + const tensor0 = @shuffle(f32, t0, t2, [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); 2948 + const tensor1 = @shuffle(f32, t0, t2, [4]i32{ 2, 1, ~@as(i32, 1), ~@as(i32, 3) }); 2949 + const tensor2 = @shuffle(f32, t2, t1, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 2) }); 2950 + const tensor3 = @shuffle(f32, t2, t1, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 1) }); 2951 + 2952 + t0 = select(x2gey2, tensor0, tensor1); 2953 + t1 = select(z2gew2, tensor2, tensor3); 2954 + t2 = select(x2py2gez2pw2, t0, t1); 2955 + 2956 + return t2 / length4(t2); 2957 + } 2958 + test "zmath.quatFromMat" { 2959 + { 2960 + const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); 2961 + const q1 = quatFromMat(rotationX(0.25 * math.pi)); 2962 + try expect(approxEqAbs(q0, q1, 0.0001)); 2963 + } 2964 + { 2965 + const q0 = quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi); 2966 + const q1 = quatFromMat(matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi)); 2967 + try expect(approxEqAbs(q0, q1, 0.0001)); 2968 + } 2969 + { 2970 + const q0 = quatFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi); 2971 + const q1 = quatFromMat(matFromRollPitchYaw(0.1 * math.pi, -0.2 * math.pi, 0.3 * math.pi)); 2972 + try expect(approxEqAbs(q0, q1, 0.0001)); 2973 + } 2974 + } 2975 + 2976 + pub fn quatFromNormAxisAngle(axis: Vec, angle: f32) Quat { 2977 + const n = f32x4(axis[0], axis[1], axis[2], 1.0); 2978 + const sc = sincos(0.5 * angle); 2979 + return n * f32x4(sc[0], sc[0], sc[0], sc[1]); 2980 + } 2981 + pub fn quatFromAxisAngle(axis: Vec, angle: f32) Quat { 2982 + assert(!all(axis == splat(F32x4, 0.0), 3)); 2983 + assert(!all(isInf(axis), 3)); 2984 + const normal = normalize3(axis); 2985 + return quatFromNormAxisAngle(normal, angle); 2986 + } 2987 + test "zmath.quaternion.quatFromNormAxisAngle" { 2988 + { 2989 + const q0 = quatFromAxisAngle(f32x4(1.0, 0.0, 0.0, 0.0), 0.25 * math.pi); 2990 + const q1 = quatFromAxisAngle(f32x4(0.0, 1.0, 0.0, 0.0), 0.125 * math.pi); 2991 + const m0 = rotationX(0.25 * math.pi); 2992 + const m1 = rotationY(0.125 * math.pi); 2993 + const mr0 = quatToMat(qmul(q0, q1)); 2994 + const mr1 = mul(m0, m1); 2995 + try expect(approxEqAbs(mr0[0], mr1[0], 0.0001)); 2996 + try expect(approxEqAbs(mr0[1], mr1[1], 0.0001)); 2997 + try expect(approxEqAbs(mr0[2], mr1[2], 0.0001)); 2998 + try expect(approxEqAbs(mr0[3], mr1[3], 0.0001)); 2999 + } 3000 + { 3001 + const m0 = quatToMat(quatFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi)); 3002 + const m1 = matFromAxisAngle(f32x4(1.0, 2.0, 0.5, 0.0), 0.25 * math.pi); 3003 + try expect(approxEqAbs(m0[0], m1[0], 0.0001)); 3004 + try expect(approxEqAbs(m0[1], m1[1], 0.0001)); 3005 + try expect(approxEqAbs(m0[2], m1[2], 0.0001)); 3006 + try expect(approxEqAbs(m0[3], m1[3], 0.0001)); 3007 + } 3008 + } 3009 + 3010 + pub inline fn qidentity() Quat { 3011 + return f32x4(@as(f32, 0.0), @as(f32, 0.0), @as(f32, 0.0), @as(f32, 1.0)); 3012 + } 3013 + 3014 + pub inline fn conjugate(quat: Quat) Quat { 3015 + return quat * f32x4(-1.0, -1.0, -1.0, 1.0); 3016 + } 3017 + 3018 + fn inverseQuat(quat: Quat) Quat { 3019 + const l = lengthSq4(quat); 3020 + const conj = conjugate(quat); 3021 + return select(l <= splat(F32x4, math.floatEps(f32)), splat(F32x4, 0.0), conj / l); 3022 + } 3023 + test "zmath.quaternion.inverseQuat" { 3024 + try expect(approxEqAbs( 3025 + inverse(f32x4(2.0, 3.0, 4.0, 1.0)), 3026 + f32x4(-1.0 / 15.0, -1.0 / 10.0, -2.0 / 15.0, 1.0 / 30.0), 3027 + 0.0001, 3028 + )); 3029 + try expect(approxEqAbs(inverse(qidentity()), qidentity(), 0.0001)); 3030 + } 3031 + 3032 + // Algorithm from: https://github.com/g-truc/glm/blob/master/glm/detail/type_quat.inl 3033 + pub fn rotate(q: Quat, v: Vec) Vec { 3034 + const w = splat(F32x4, q[3]); 3035 + const axis = f32x4(q[0], q[1], q[2], 0.0); 3036 + const uv = cross3(axis, v); 3037 + return v + ((uv * w) + cross3(axis, uv)) * splat(F32x4, 2.0); 3038 + } 3039 + test "zmath.quaternion.rotate" { 3040 + const quat = quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi); 3041 + const mat = matFromQuat(quat); 3042 + const forward = f32x4(0.0, 0.0, -1.0, 0.0); 3043 + const up = f32x4(0.0, 1.0, 0.0, 0.0); 3044 + const right = f32x4(1.0, 0.0, 0.0, 0.0); 3045 + try expect(approxEqAbs(rotate(quat, forward), mul(forward, mat), 0.0001)); 3046 + try expect(approxEqAbs(rotate(quat, up), mul(up, mat), 0.0001)); 3047 + try expect(approxEqAbs(rotate(quat, right), mul(right, mat), 0.0001)); 3048 + } 3049 + 3050 + pub fn slerp(q0: Quat, q1: Quat, t: f32) Quat { 3051 + return slerpV(q0, q1, splat(F32x4, t)); 3052 + } 3053 + pub fn slerpV(q0: Quat, q1: Quat, t: F32x4) Quat { 3054 + var cos_omega = dot4(q0, q1); 3055 + const sign = select(cos_omega < splat(F32x4, 0.0), splat(F32x4, -1.0), splat(F32x4, 1.0)); 3056 + 3057 + cos_omega = cos_omega * sign; 3058 + const sin_omega = sqrt(splat(F32x4, 1.0) - cos_omega * cos_omega); 3059 + 3060 + const omega = atan2(sin_omega, cos_omega); 3061 + 3062 + var v01 = t; 3063 + v01 = xorInt(andInt(v01, f32x4_mask2), f32x4_sign_mask1); 3064 + v01 = f32x4(1.0, 0.0, 0.0, 0.0) + v01; 3065 + 3066 + var s0 = sin(v01 * omega) / sin_omega; 3067 + s0 = select(cos_omega < splat(F32x4, 1.0 - 0.00001), s0, v01); 3068 + 3069 + const s1 = swizzle(s0, .y, .y, .y, .y); 3070 + s0 = swizzle(s0, .x, .x, .x, .x); 3071 + 3072 + return q0 * s0 + sign * q1 * s1; 3073 + } 3074 + test "zmath.quaternion.slerp" { 3075 + const from = f32x4(0.0, 0.0, 0.0, 1.0); 3076 + const to = f32x4(0.5, 0.5, -0.5, 0.5); 3077 + const result = slerp(from, to, 0.5); 3078 + try expect(approxEqAbs(result, f32x4(0.28867513, 0.28867513, -0.28867513, 0.86602540), 0.0001)); 3079 + } 3080 + 3081 + // Converts q back to euler angles, assuming a YXZ rotation order. 3082 + // See: http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler 3083 + pub fn quatToRollPitchYaw(q: Quat) [3]f32 { 3084 + var angles: [3]f32 = undefined; 3085 + 3086 + const p = swizzle(q, .w, .y, .x, .z); 3087 + const sign = -1.0; 3088 + 3089 + const singularity = p[0] * p[2] + sign * p[1] * p[3]; 3090 + if (singularity > 0.499) { 3091 + angles[0] = math.pi * 0.5; 3092 + angles[1] = 2.0 * math.atan2(f32, p[1], p[0]); 3093 + angles[2] = 0.0; 3094 + } else if (singularity < -0.499) { 3095 + angles[0] = -math.pi * 0.5; 3096 + angles[1] = 2.0 * math.atan2(f32, p[1], p[0]); 3097 + angles[2] = 0.0; 3098 + } else { 3099 + const sq = p * p; 3100 + const y = splat(F32x4, 2.0) * f32x4(p[0] * p[1] - sign * p[2] * p[3], p[0] * p[3] - sign * p[1] * p[2], 0.0, 0.0); 3101 + const x = splat(F32x4, 1.0) - (splat(F32x4, 2.0) * f32x4(sq[1] + sq[2], sq[2] + sq[3], 0.0, 0.0)); 3102 + const res = atan2(y, x); 3103 + angles[0] = math.asin(2.0 * singularity); 3104 + angles[1] = res[0]; 3105 + angles[2] = res[1]; 3106 + } 3107 + 3108 + return angles; 3109 + } 3110 + 3111 + test "zmath.quaternion.quatToRollPitchYaw" { 3112 + { 3113 + const expected = f32x4(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); 3114 + const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]); 3115 + const result = quatToRollPitchYaw(quat); 3116 + try expect(approxEqAbs(loadArr3(result), expected, 0.0001)); 3117 + } 3118 + 3119 + { 3120 + const expected = f32x4(0.3 * math.pi, 0.1 * math.pi, 0.2 * math.pi, 0.0); 3121 + const quat = quatFromRollPitchYaw(expected[0], expected[1], expected[2]); 3122 + const result = quatToRollPitchYaw(quat); 3123 + try expect(approxEqAbs(loadArr3(result), expected, 0.0001)); 3124 + } 3125 + 3126 + // North pole singularity 3127 + { 3128 + const angle = f32x4(0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); 3129 + const expected = f32x4(0.5 * math.pi, -0.1 * math.pi, 0.0, 0.0); 3130 + const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]); 3131 + const result = quatToRollPitchYaw(quat); 3132 + try expect(approxEqAbs(loadArr3(result), expected, 0.0001)); 3133 + } 3134 + 3135 + // South pole singularity 3136 + { 3137 + const angle = f32x4(-0.5 * math.pi, 0.2 * math.pi, 0.3 * math.pi, 0.0); 3138 + const expected = f32x4(-0.5 * math.pi, 0.5 * math.pi, 0.0, 0.0); 3139 + const quat = quatFromRollPitchYaw(angle[0], angle[1], angle[2]); 3140 + const result = quatToRollPitchYaw(quat); 3141 + try expect(approxEqAbs(loadArr3(result), expected, 0.0001)); 3142 + } 3143 + } 3144 + 3145 + pub fn quatFromRollPitchYaw(pitch: f32, yaw: f32, roll: f32) Quat { 3146 + return quatFromRollPitchYawV(f32x4(pitch, yaw, roll, 0.0)); 3147 + } 3148 + pub fn quatFromRollPitchYawV(angles: Vec) Quat { // | pitch | yaw | roll | 0 | 3149 + const sc = sincos(splat(Vec, 0.5) * angles); 3150 + const p0 = @shuffle(f32, sc[1], sc[0], [4]i32{ ~@as(i32, 0), 0, 0, 0 }); 3151 + const p1 = @shuffle(f32, sc[0], sc[1], [4]i32{ ~@as(i32, 0), 0, 0, 0 }); 3152 + const y0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 1, ~@as(i32, 1), 1, 1 }); 3153 + const y1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 1, ~@as(i32, 1), 1, 1 }); 3154 + const r0 = @shuffle(f32, sc[1], sc[0], [4]i32{ 2, 2, ~@as(i32, 2), 2 }); 3155 + const r1 = @shuffle(f32, sc[0], sc[1], [4]i32{ 2, 2, ~@as(i32, 2), 2 }); 3156 + const q1 = p1 * f32x4(1.0, -1.0, -1.0, 1.0) * y1; 3157 + const q0 = p0 * y0 * r0; 3158 + return mulAdd(q1, r1, q0); 3159 + } 3160 + test "zmath.quaternion.quatFromRollPitchYawV" { 3161 + { 3162 + const m0 = quatToMat(quatFromRollPitchYawV(f32x4(0.25 * math.pi, 0.0, 0.0, 0.0))); 3163 + const m1 = rotationX(0.25 * math.pi); 3164 + try expect(approxEqAbs(m0[0], m1[0], 0.0001)); 3165 + try expect(approxEqAbs(m0[1], m1[1], 0.0001)); 3166 + try expect(approxEqAbs(m0[2], m1[2], 0.0001)); 3167 + try expect(approxEqAbs(m0[3], m1[3], 0.0001)); 3168 + } 3169 + { 3170 + const m0 = quatToMat(quatFromRollPitchYaw(0.1 * math.pi, 0.2 * math.pi, 0.3 * math.pi)); 3171 + const m1 = mul( 3172 + rotationZ(0.3 * math.pi), 3173 + mul(rotationX(0.1 * math.pi), rotationY(0.2 * math.pi)), 3174 + ); 3175 + try expect(approxEqAbs(m0[0], m1[0], 0.0001)); 3176 + try expect(approxEqAbs(m0[1], m1[1], 0.0001)); 3177 + try expect(approxEqAbs(m0[2], m1[2], 0.0001)); 3178 + try expect(approxEqAbs(m0[3], m1[3], 0.0001)); 3179 + } 3180 + } 3181 + // ------------------------------------------------------------------------------ 3182 + // 3183 + // 6. Color functions 3184 + // 3185 + // ------------------------------------------------------------------------------ 3186 + pub fn adjustSaturation(color: F32x4, saturation: f32) F32x4 { 3187 + const luminance = dot3(f32x4(0.2125, 0.7154, 0.0721, 0.0), color); 3188 + var result = mulAdd(color - luminance, f32x4s(saturation), luminance); 3189 + result[3] = color[3]; 3190 + return result; 3191 + } 3192 + 3193 + pub fn adjustContrast(color: F32x4, contrast: f32) F32x4 { 3194 + var result = mulAdd(color - f32x4s(0.5), f32x4s(contrast), f32x4s(0.5)); 3195 + result[3] = color[3]; 3196 + return result; 3197 + } 3198 + 3199 + pub fn rgbToHsl(rgb: F32x4) F32x4 { 3200 + const r = swizzle(rgb, .x, .x, .x, .x); 3201 + const g = swizzle(rgb, .y, .y, .y, .y); 3202 + const b = swizzle(rgb, .z, .z, .z, .z); 3203 + 3204 + const minv = min(r, min(g, b)); 3205 + const maxv = max(r, max(g, b)); 3206 + 3207 + const l = (minv + maxv) * f32x4s(0.5); 3208 + const d = maxv - minv; 3209 + const la = select(boolx4(true, true, true, false), l, rgb); 3210 + 3211 + if (all(d < f32x4s(math.floatEps(f32)), 3)) { 3212 + return select(boolx4(true, true, false, false), f32x4s(0.0), la); 3213 + } else { 3214 + var s: F32x4 = undefined; 3215 + var h: F32x4 = undefined; 3216 + 3217 + const d2 = minv + maxv; 3218 + 3219 + if (all(l > f32x4s(0.5), 3)) { 3220 + s = d / (f32x4s(2.0) - d2); 3221 + } else { 3222 + s = d / d2; 3223 + } 3224 + 3225 + if (all(r == maxv, 3)) { 3226 + h = (g - b) / d; 3227 + } else if (all(g == maxv, 3)) { 3228 + h = f32x4s(2.0) + (b - r) / d; 3229 + } else { 3230 + h = f32x4s(4.0) + (r - g) / d; 3231 + } 3232 + 3233 + h /= f32x4s(6.0); 3234 + 3235 + if (all(h < f32x4s(0.0), 3)) { 3236 + h += f32x4s(1.0); 3237 + } 3238 + 3239 + const lha = select(boolx4(true, true, false, false), h, la); 3240 + return select(boolx4(true, false, true, true), lha, s); 3241 + } 3242 + } 3243 + test "zmath.color.rgbToHsl" { 3244 + try expect(approxEqAbs(rgbToHsl(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.6, 0.5, 1.0), 0.0001)); 3245 + try expect(approxEqAbs(rgbToHsl(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 0.5, 0.5), 0.0001)); 3246 + try expect(approxEqAbs(rgbToHsl(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 0.5, 0.25), 0.0001)); 3247 + try expect(approxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 0.5, 1.0), 0.0001)); 3248 + try expect(approxEqAbs(rgbToHsl(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001)); 3249 + try expect(approxEqAbs(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001)); 3250 + } 3251 + 3252 + fn hueToClr(p: F32x4, q: F32x4, h: F32x4) F32x4 { 3253 + var t = h; 3254 + 3255 + if (all(t < f32x4s(0.0), 3)) 3256 + t += f32x4s(1.0); 3257 + 3258 + if (all(t > f32x4s(1.0), 3)) 3259 + t -= f32x4s(1.0); 3260 + 3261 + if (all(t < f32x4s(1.0 / 6.0), 3)) 3262 + return mulAdd(q - p, f32x4s(6.0) * t, p); 3263 + 3264 + if (all(t < f32x4s(0.5), 3)) 3265 + return q; 3266 + 3267 + if (all(t < f32x4s(2.0 / 3.0), 3)) 3268 + return mulAdd(q - p, f32x4s(6.0) * (f32x4s(2.0 / 3.0) - t), p); 3269 + 3270 + return p; 3271 + } 3272 + 3273 + pub fn hslToRgb(hsl: F32x4) F32x4 { 3274 + const s = swizzle(hsl, .y, .y, .y, .y); 3275 + const l = swizzle(hsl, .z, .z, .z, .z); 3276 + 3277 + if (all(isNearEqual(s, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) { 3278 + return select(boolx4(true, true, true, false), l, hsl); 3279 + } else { 3280 + const h = swizzle(hsl, .x, .x, .x, .x); 3281 + var q: F32x4 = undefined; 3282 + if (all(l < f32x4s(0.5), 3)) { 3283 + q = l * (f32x4s(1.0) + s); 3284 + } else { 3285 + q = (l + s) - (l * s); 3286 + } 3287 + 3288 + const p = f32x4s(2.0) * l - q; 3289 + 3290 + const r = hueToClr(p, q, h + f32x4s(1.0 / 3.0)); 3291 + const g = hueToClr(p, q, h); 3292 + const b = hueToClr(p, q, h - f32x4s(1.0 / 3.0)); 3293 + 3294 + const rg = select(boolx4(true, false, false, false), r, g); 3295 + const ba = select(boolx4(true, true, true, false), b, hsl); 3296 + return select(boolx4(true, true, false, false), rg, ba); 3297 + } 3298 + } 3299 + test "zmath.color.hslToRgb" { 3300 + try expect(approxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hslToRgb(f32x4(0.6111, 0.6, 0.5, 1.0)), 0.0001)); 3301 + try expect(approxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hslToRgb(f32x4(0.0, 1.0, 0.5, 0.5)), 0.0001)); 3302 + try expect(approxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hslToRgb(f32x4(0.3333, 1.0, 0.5, 0.25)), 0.0005)); 3303 + try expect(approxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hslToRgb(f32x4(0.6666, 1.0, 0.5, 1.0)), 0.0005)); 3304 + try expect(approxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), 0.0001)); 3305 + try expect(approxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hslToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), 0.0001)); 3306 + try expect(approxEqAbs(hslToRgb(rgbToHsl(f32x4(1.0, 1.0, 1.0, 1.0))), f32x4(1.0, 1.0, 1.0, 1.0), 0.0005)); 3307 + try expect(approxEqAbs( 3308 + hslToRgb(rgbToHsl(f32x4(0.82198, 0.1839, 0.632, 1.0))), 3309 + f32x4(0.82198, 0.1839, 0.632, 1.0), 3310 + 0.0005, 3311 + )); 3312 + try expect(approxEqAbs( 3313 + rgbToHsl(hslToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))), 3314 + f32x4(0.82198, 0.1839, 0.632, 1.0), 3315 + 0.0005, 3316 + )); 3317 + try expect(approxEqAbs( 3318 + rgbToHsl(hslToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), 3319 + f32x4(0.1839, 0.82198, 0.632, 1.0), 3320 + 0.0005, 3321 + )); 3322 + try expect(approxEqAbs( 3323 + hslToRgb(rgbToHsl(f32x4(0.1839, 0.632, 0.82198, 1.0))), 3324 + f32x4(0.1839, 0.632, 0.82198, 1.0), 3325 + 0.0005, 3326 + )); 3327 + } 3328 + 3329 + pub fn rgbToHsv(rgb: F32x4) F32x4 { 3330 + const r = swizzle(rgb, .x, .x, .x, .x); 3331 + const g = swizzle(rgb, .y, .y, .y, .y); 3332 + const b = swizzle(rgb, .z, .z, .z, .z); 3333 + 3334 + const minv = min(r, min(g, b)); 3335 + const v = max(r, max(g, b)); 3336 + const d = v - minv; 3337 + const s = if (all(isNearEqual(v, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) f32x4s(0.0) else d / v; 3338 + 3339 + if (all(d < f32x4s(math.floatEps(f32)), 3)) { 3340 + const hv = select(boolx4(true, false, false, false), f32x4s(0.0), v); 3341 + const hva = select(boolx4(true, true, true, false), hv, rgb); 3342 + return select(boolx4(true, false, true, true), hva, s); 3343 + } else { 3344 + var h: F32x4 = undefined; 3345 + if (all(r == v, 3)) { 3346 + h = (g - b) / d; 3347 + if (all(g < b, 3)) 3348 + h += f32x4s(6.0); 3349 + } else if (all(g == v, 3)) { 3350 + h = f32x4s(2.0) + (b - r) / d; 3351 + } else { 3352 + h = f32x4s(4.0) + (r - g) / d; 3353 + } 3354 + 3355 + h /= f32x4s(6.0); 3356 + const hv = select(boolx4(true, false, false, false), h, v); 3357 + const hva = select(boolx4(true, true, true, false), hv, rgb); 3358 + return select(boolx4(true, false, true, true), hva, s); 3359 + } 3360 + } 3361 + test "zmath.color.rgbToHsv" { 3362 + try expect(approxEqAbs(rgbToHsv(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.6111, 0.75, 0.8, 1.0), 0.0001)); 3363 + try expect(approxEqAbs(rgbToHsv(f32x4(0.4, 0.2, 0.8, 1.0)), f32x4(0.7222, 0.75, 0.8, 1.0), 0.0001)); 3364 + try expect(approxEqAbs(rgbToHsv(f32x4(0.4, 0.8, 0.2, 1.0)), f32x4(0.2777, 0.75, 0.8, 1.0), 0.0001)); 3365 + try expect(approxEqAbs(rgbToHsv(f32x4(1.0, 0.0, 0.0, 0.5)), f32x4(0.0, 1.0, 1.0, 0.5), 0.0001)); 3366 + try expect(approxEqAbs(rgbToHsv(f32x4(0.0, 1.0, 0.0, 0.25)), f32x4(0.3333, 1.0, 1.0, 0.25), 0.0001)); 3367 + try expect(approxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 1.0, 1.0)), f32x4(0.6666, 1.0, 1.0, 1.0), 0.0001)); 3368 + try expect(approxEqAbs(rgbToHsv(f32x4(0.0, 0.0, 0.0, 1.0)), f32x4(0.0, 0.0, 0.0, 1.0), 0.0001)); 3369 + try expect(approxEqAbs(rgbToHsv(f32x4(1.0, 1.0, 1.0, 1.0)), f32x4(0.0, 0.0, 1.0, 1.0), 0.0001)); 3370 + } 3371 + 3372 + pub fn hsvToRgb(hsv: F32x4) F32x4 { 3373 + const h = swizzle(hsv, .x, .x, .x, .x); 3374 + const s = swizzle(hsv, .y, .y, .y, .y); 3375 + const v = swizzle(hsv, .z, .z, .z, .z); 3376 + 3377 + const h6 = h * f32x4s(6.0); 3378 + const i = floor(h6); 3379 + const f = h6 - i; 3380 + 3381 + const p = v * (f32x4s(1.0) - s); 3382 + const q = v * (f32x4s(1.0) - f * s); 3383 + const t = v * (f32x4s(1.0) - (f32x4s(1.0) - f) * s); 3384 + 3385 + const ii = @as(i32, @intFromFloat(mod(i, f32x4s(6.0))[0])); 3386 + const rgb = switch (ii) { 3387 + 0 => blk: { 3388 + const vt = select(boolx4(true, false, false, false), v, t); 3389 + break :blk select(boolx4(true, true, false, false), vt, p); 3390 + }, 3391 + 1 => blk: { 3392 + const qv = select(boolx4(true, false, false, false), q, v); 3393 + break :blk select(boolx4(true, true, false, false), qv, p); 3394 + }, 3395 + 2 => blk: { 3396 + const pv = select(boolx4(true, false, false, false), p, v); 3397 + break :blk select(boolx4(true, true, false, false), pv, t); 3398 + }, 3399 + 3 => blk: { 3400 + const pq = select(boolx4(true, false, false, false), p, q); 3401 + break :blk select(boolx4(true, true, false, false), pq, v); 3402 + }, 3403 + 4 => blk: { 3404 + const tp = select(boolx4(true, false, false, false), t, p); 3405 + break :blk select(boolx4(true, true, false, false), tp, v); 3406 + }, 3407 + 5 => blk: { 3408 + const vp = select(boolx4(true, false, false, false), v, p); 3409 + break :blk select(boolx4(true, true, false, false), vp, q); 3410 + }, 3411 + else => unreachable, 3412 + }; 3413 + return select(boolx4(true, true, true, false), rgb, hsv); 3414 + } 3415 + test "zmath.color.hsvToRgb" { 3416 + const epsilon = 0.0005; 3417 + try expect(approxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), hsvToRgb(f32x4(0.6111, 0.75, 0.8, 1.0)), epsilon)); 3418 + try expect(approxEqAbs(f32x4(0.4, 0.2, 0.8, 1.0), hsvToRgb(f32x4(0.7222, 0.75, 0.8, 1.0)), epsilon)); 3419 + try expect(approxEqAbs(f32x4(0.4, 0.8, 0.2, 1.0), hsvToRgb(f32x4(0.2777, 0.75, 0.8, 1.0)), epsilon)); 3420 + try expect(approxEqAbs(f32x4(1.0, 0.0, 0.0, 0.5), hsvToRgb(f32x4(0.0, 1.0, 1.0, 0.5)), epsilon)); 3421 + try expect(approxEqAbs(f32x4(0.0, 1.0, 0.0, 0.25), hsvToRgb(f32x4(0.3333, 1.0, 1.0, 0.25)), epsilon)); 3422 + try expect(approxEqAbs(f32x4(0.0, 0.0, 1.0, 1.0), hsvToRgb(f32x4(0.6666, 1.0, 1.0, 1.0)), epsilon)); 3423 + try expect(approxEqAbs(f32x4(0.0, 0.0, 0.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 0.0, 1.0)), epsilon)); 3424 + try expect(approxEqAbs(f32x4(1.0, 1.0, 1.0, 1.0), hsvToRgb(f32x4(0.0, 0.0, 1.0, 1.0)), epsilon)); 3425 + try expect(approxEqAbs( 3426 + hsvToRgb(rgbToHsv(f32x4(0.1839, 0.632, 0.82198, 1.0))), 3427 + f32x4(0.1839, 0.632, 0.82198, 1.0), 3428 + epsilon, 3429 + )); 3430 + try expect(approxEqAbs( 3431 + hsvToRgb(rgbToHsv(f32x4(0.82198, 0.1839, 0.632, 1.0))), 3432 + f32x4(0.82198, 0.1839, 0.632, 1.0), 3433 + epsilon, 3434 + )); 3435 + try expect(approxEqAbs( 3436 + rgbToHsv(hsvToRgb(f32x4(0.82198, 0.1839, 0.632, 1.0))), 3437 + f32x4(0.82198, 0.1839, 0.632, 1.0), 3438 + epsilon, 3439 + )); 3440 + try expect(approxEqAbs( 3441 + rgbToHsv(hsvToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), 3442 + f32x4(0.1839, 0.82198, 0.632, 1.0), 3443 + epsilon, 3444 + )); 3445 + } 3446 + 3447 + pub fn rgbToSrgb(rgb: F32x4) F32x4 { 3448 + const static = struct { 3449 + const cutoff = f32x4(0.0031308, 0.0031308, 0.0031308, 1.0); 3450 + const linear = f32x4(12.92, 12.92, 12.92, 1.0); 3451 + const scale = f32x4(1.055, 1.055, 1.055, 1.0); 3452 + const bias = f32x4(0.055, 0.055, 0.055, 1.0); 3453 + const rgamma = 1.0 / 2.4; 3454 + }; 3455 + var v = saturate(rgb); 3456 + const v0 = v * static.linear; 3457 + const v1 = static.scale * f32x4( 3458 + math.pow(f32, v[0], static.rgamma), 3459 + math.pow(f32, v[1], static.rgamma), 3460 + math.pow(f32, v[2], static.rgamma), 3461 + v[3], 3462 + ) - static.bias; 3463 + v = select(v < static.cutoff, v0, v1); 3464 + return select(boolx4(true, true, true, false), v, rgb); 3465 + } 3466 + test "zmath.color.rgbToSrgb" { 3467 + const epsilon = 0.001; 3468 + try expect(approxEqAbs(rgbToSrgb(f32x4(0.2, 0.4, 0.8, 1.0)), f32x4(0.484, 0.665, 0.906, 1.0), epsilon)); 3469 + } 3470 + 3471 + pub fn srgbToRgb(srgb: F32x4) F32x4 { 3472 + const static = struct { 3473 + const cutoff = f32x4(0.04045, 0.04045, 0.04045, 1.0); 3474 + const rlinear = f32x4(1.0 / 12.92, 1.0 / 12.92, 1.0 / 12.92, 1.0); 3475 + const scale = f32x4(1.0 / 1.055, 1.0 / 1.055, 1.0 / 1.055, 1.0); 3476 + const bias = f32x4(0.055, 0.055, 0.055, 1.0); 3477 + const gamma = 2.4; 3478 + }; 3479 + var v = saturate(srgb); 3480 + const v0 = v * static.rlinear; 3481 + var v1 = static.scale * (v + static.bias); 3482 + v1 = f32x4( 3483 + math.pow(f32, v1[0], static.gamma), 3484 + math.pow(f32, v1[1], static.gamma), 3485 + math.pow(f32, v1[2], static.gamma), 3486 + v1[3], 3487 + ); 3488 + v = select(v > static.cutoff, v1, v0); 3489 + return select(boolx4(true, true, true, false), v, srgb); 3490 + } 3491 + test "zmath.color.srgbToRgb" { 3492 + const epsilon = 0.0007; 3493 + try expect(approxEqAbs(f32x4(0.2, 0.4, 0.8, 1.0), srgbToRgb(f32x4(0.484, 0.665, 0.906, 1.0)), epsilon)); 3494 + try expect(approxEqAbs( 3495 + rgbToSrgb(srgbToRgb(f32x4(0.1839, 0.82198, 0.632, 1.0))), 3496 + f32x4(0.1839, 0.82198, 0.632, 1.0), 3497 + epsilon, 3498 + )); 3499 + } 3500 + // ------------------------------------------------------------------------------ 3501 + // 3502 + // X. Misc functions 3503 + // 3504 + // ------------------------------------------------------------------------------ 3505 + pub fn linePointDistance(linept0: Vec, linept1: Vec, pt: Vec) F32x4 { 3506 + const ptvec = pt - linept0; 3507 + const linevec = linept1 - linept0; 3508 + const scale = dot3(ptvec, linevec) / lengthSq3(linevec); 3509 + return length3(ptvec - linevec * scale); 3510 + } 3511 + test "zmath.linePointDistance" { 3512 + { 3513 + const linept0 = f32x4(-1.0, -2.0, -3.0, 1.0); 3514 + const linept1 = f32x4(1.0, 2.0, 3.0, 1.0); 3515 + const pt = f32x4(1.0, 1.0, 1.0, 1.0); 3516 + const v = linePointDistance(linept0, linept1, pt); 3517 + try expect(approxEqAbs(v, splat(F32x4, 0.654), 0.001)); 3518 + } 3519 + } 3520 + 3521 + fn sin32(v: f32) f32 { 3522 + var y = v - math.tau * @round(v * 1.0 / math.tau); 3523 + 3524 + if (y > 0.5 * math.pi) { 3525 + y = math.pi - y; 3526 + } else if (y < -math.pi * 0.5) { 3527 + y = -math.pi - y; 3528 + } 3529 + const y2 = y * y; 3530 + 3531 + // 11-degree minimax approximation 3532 + var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06); 3533 + sinv = mulAdd(sinv, y2, -0.00019840874); 3534 + sinv = mulAdd(sinv, y2, 0.0083333310); 3535 + sinv = mulAdd(sinv, y2, -0.16666667); 3536 + return y * mulAdd(sinv, y2, 1.0); 3537 + } 3538 + fn cos32(v: f32) f32 { 3539 + var y = v - math.tau * @round(v * 1.0 / math.tau); 3540 + 3541 + const sign = blk: { 3542 + if (y > 0.5 * math.pi) { 3543 + y = math.pi - y; 3544 + break :blk @as(f32, -1.0); 3545 + } else if (y < -math.pi * 0.5) { 3546 + y = -math.pi - y; 3547 + break :blk @as(f32, -1.0); 3548 + } else { 3549 + break :blk @as(f32, 1.0); 3550 + } 3551 + }; 3552 + const y2 = y * y; 3553 + 3554 + // 10-degree minimax approximation 3555 + var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05); 3556 + cosv = mulAdd(cosv, y2, -0.0013888378); 3557 + cosv = mulAdd(cosv, y2, 0.041666638); 3558 + cosv = mulAdd(cosv, y2, -0.5); 3559 + return sign * mulAdd(cosv, y2, 1.0); 3560 + } 3561 + fn sincos32(v: f32) [2]f32 { 3562 + var y = v - math.tau * @round(v * 1.0 / math.tau); 3563 + 3564 + const sign = blk: { 3565 + if (y > 0.5 * math.pi) { 3566 + y = math.pi - y; 3567 + break :blk @as(f32, -1.0); 3568 + } else if (y < -math.pi * 0.5) { 3569 + y = -math.pi - y; 3570 + break :blk @as(f32, -1.0); 3571 + } else { 3572 + break :blk @as(f32, 1.0); 3573 + } 3574 + }; 3575 + const y2 = y * y; 3576 + 3577 + // 11-degree minimax approximation 3578 + var sinv = mulAdd(@as(f32, -2.3889859e-08), y2, 2.7525562e-06); 3579 + sinv = mulAdd(sinv, y2, -0.00019840874); 3580 + sinv = mulAdd(sinv, y2, 0.0083333310); 3581 + sinv = mulAdd(sinv, y2, -0.16666667); 3582 + sinv = y * mulAdd(sinv, y2, 1.0); 3583 + 3584 + // 10-degree minimax approximation 3585 + var cosv = mulAdd(@as(f32, -2.6051615e-07), y2, 2.4760495e-05); 3586 + cosv = mulAdd(cosv, y2, -0.0013888378); 3587 + cosv = mulAdd(cosv, y2, 0.041666638); 3588 + cosv = mulAdd(cosv, y2, -0.5); 3589 + cosv = sign * mulAdd(cosv, y2, 1.0); 3590 + 3591 + return .{ sinv, cosv }; 3592 + } 3593 + test "zmath.sincos32" { 3594 + const epsilon = 0.0001; 3595 + 3596 + try expect(math.isNan(sincos32(math.inf(f32))[0])); 3597 + try expect(math.isNan(sincos32(math.inf(f32))[1])); 3598 + try expect(math.isNan(sincos32(-math.inf(f32))[0])); 3599 + try expect(math.isNan(sincos32(-math.inf(f32))[1])); 3600 + try expect(math.isNan(sincos32(math.nan(f32))[0])); 3601 + try expect(math.isNan(sincos32(-math.nan(f32))[1])); 3602 + 3603 + try expect(math.isNan(sin32(math.inf(f32)))); 3604 + try expect(math.isNan(cos32(math.inf(f32)))); 3605 + try expect(math.isNan(sin32(-math.inf(f32)))); 3606 + try expect(math.isNan(cos32(-math.inf(f32)))); 3607 + try expect(math.isNan(sin32(math.nan(f32)))); 3608 + try expect(math.isNan(cos32(-math.nan(f32)))); 3609 + 3610 + var f: f32 = -100.0; 3611 + var i: u32 = 0; 3612 + while (i < 100) : (i += 1) { 3613 + const sc = sincos32(f); 3614 + const s0 = sin32(f); 3615 + const c0 = cos32(f); 3616 + const s = @sin(f); 3617 + const c = @cos(f); 3618 + try expect(math.approxEqAbs(f32, sc[0], s, epsilon)); 3619 + try expect(math.approxEqAbs(f32, sc[1], c, epsilon)); 3620 + try expect(math.approxEqAbs(f32, s0, s, epsilon)); 3621 + try expect(math.approxEqAbs(f32, c0, c, epsilon)); 3622 + f += 0.12345 * @as(f32, @floatFromInt(i)); 3623 + } 3624 + } 3625 + 3626 + fn asin32(v: f32) f32 { 3627 + const x = @abs(v); 3628 + var omx = 1.0 - x; 3629 + if (omx < 0.0) { 3630 + omx = 0.0; 3631 + } 3632 + const root = @sqrt(omx); 3633 + 3634 + // 7-degree minimax approximation 3635 + var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901); 3636 + result = mulAdd(result, x, -0.0170881256); 3637 + result = mulAdd(result, x, 0.0308918810); 3638 + result = mulAdd(result, x, -0.0501743046); 3639 + result = mulAdd(result, x, 0.0889789874); 3640 + result = mulAdd(result, x, -0.2145988016); 3641 + result = root * mulAdd(result, x, 1.5707963050); 3642 + 3643 + return if (v >= 0.0) 0.5 * math.pi - result else result - 0.5 * math.pi; 3644 + } 3645 + test "zmath.asin32" { 3646 + const epsilon = 0.0001; 3647 + 3648 + try expect(math.approxEqAbs(f32, asin(@as(f32, -1.1)), -0.5 * math.pi, epsilon)); 3649 + try expect(math.approxEqAbs(f32, asin(@as(f32, 1.1)), 0.5 * math.pi, epsilon)); 3650 + try expect(math.approxEqAbs(f32, asin(@as(f32, -1000.1)), -0.5 * math.pi, epsilon)); 3651 + try expect(math.approxEqAbs(f32, asin(@as(f32, 100000.1)), 0.5 * math.pi, epsilon)); 3652 + try expect(math.isNan(asin(math.inf(f32)))); 3653 + try expect(math.isNan(asin(-math.inf(f32)))); 3654 + try expect(math.isNan(asin(math.nan(f32)))); 3655 + try expect(math.isNan(asin(-math.nan(f32)))); 3656 + 3657 + try expect(approxEqAbs(asin(splat(F32x8, -100.0)), splat(F32x8, -0.5 * math.pi), epsilon)); 3658 + try expect(approxEqAbs(asin(splat(F32x16, 100.0)), splat(F32x16, 0.5 * math.pi), epsilon)); 3659 + try expect(all(isNan(asin(splat(F32x4, math.inf(f32)))), 0) == true); 3660 + try expect(all(isNan(asin(splat(F32x4, -math.inf(f32)))), 0) == true); 3661 + try expect(all(isNan(asin(splat(F32x4, math.nan(f32)))), 0) == true); 3662 + try expect(all(isNan(asin(splat(F32x4, math.snan(f32)))), 0) == true); 3663 + 3664 + var f: f32 = -1.0; 3665 + var i: u32 = 0; 3666 + while (i < 8) : (i += 1) { 3667 + const r0 = asin32(f); 3668 + const r1 = math.asin(f); 3669 + const r4 = asin(splat(F32x4, f)); 3670 + const r8 = asin(splat(F32x8, f)); 3671 + const r16 = asin(splat(F32x16, f)); 3672 + try expect(math.approxEqAbs(f32, r0, r1, epsilon)); 3673 + try expect(approxEqAbs(r4, splat(F32x4, r1), epsilon)); 3674 + try expect(approxEqAbs(r8, splat(F32x8, r1), epsilon)); 3675 + try expect(approxEqAbs(r16, splat(F32x16, r1), epsilon)); 3676 + f += 0.09 * @as(f32, @floatFromInt(i)); 3677 + } 3678 + } 3679 + 3680 + fn acos32(v: f32) f32 { 3681 + const x = @abs(v); 3682 + var omx = 1.0 - x; 3683 + if (omx < 0.0) { 3684 + omx = 0.0; 3685 + } 3686 + const root = @sqrt(omx); 3687 + 3688 + // 7-degree minimax approximation 3689 + var result = mulAdd(@as(f32, -0.0012624911), x, 0.0066700901); 3690 + result = mulAdd(result, x, -0.0170881256); 3691 + result = mulAdd(result, x, 0.0308918810); 3692 + result = mulAdd(result, x, -0.0501743046); 3693 + result = mulAdd(result, x, 0.0889789874); 3694 + result = mulAdd(result, x, -0.2145988016); 3695 + result = root * mulAdd(result, x, 1.5707963050); 3696 + 3697 + return if (v >= 0.0) result else math.pi - result; 3698 + } 3699 + test "zmath.acos32" { 3700 + const epsilon = 0.1; 3701 + 3702 + try expect(math.approxEqAbs(f32, acos(@as(f32, -1.1)), math.pi, epsilon)); 3703 + try expect(math.approxEqAbs(f32, acos(@as(f32, -10000.1)), math.pi, epsilon)); 3704 + try expect(math.approxEqAbs(f32, acos(@as(f32, 1.1)), 0.0, epsilon)); 3705 + try expect(math.approxEqAbs(f32, acos(@as(f32, 1000.1)), 0.0, epsilon)); 3706 + try expect(math.isNan(acos(math.inf(f32)))); 3707 + try expect(math.isNan(acos(-math.inf(f32)))); 3708 + try expect(math.isNan(acos(math.nan(f32)))); 3709 + try expect(math.isNan(acos(-math.nan(f32)))); 3710 + 3711 + try expect(approxEqAbs(acos(splat(F32x8, -100.0)), splat(F32x8, math.pi), epsilon)); 3712 + try expect(approxEqAbs(acos(splat(F32x16, 100.0)), splat(F32x16, 0.0), epsilon)); 3713 + try expect(all(isNan(acos(splat(F32x4, math.inf(f32)))), 0) == true); 3714 + try expect(all(isNan(acos(splat(F32x4, -math.inf(f32)))), 0) == true); 3715 + try expect(all(isNan(acos(splat(F32x4, math.nan(f32)))), 0) == true); 3716 + try expect(all(isNan(acos(splat(F32x4, math.snan(f32)))), 0) == true); 3717 + 3718 + var f: f32 = -1.0; 3719 + var i: u32 = 0; 3720 + while (i < 8) : (i += 1) { 3721 + const r0 = acos32(f); 3722 + const r1 = math.acos(f); 3723 + const r4 = acos(splat(F32x4, f)); 3724 + const r8 = acos(splat(F32x8, f)); 3725 + const r16 = acos(splat(F32x16, f)); 3726 + try expect(math.approxEqAbs(f32, r0, r1, epsilon)); 3727 + try expect(approxEqAbs(r4, splat(F32x4, r1), epsilon)); 3728 + try expect(approxEqAbs(r8, splat(F32x8, r1), epsilon)); 3729 + try expect(approxEqAbs(r16, splat(F32x16, r1), epsilon)); 3730 + f += 0.09 * @as(f32, @floatFromInt(i)); 3731 + } 3732 + } 3733 + 3734 + pub fn modAngle32(in_angle: f32) f32 { 3735 + const angle = in_angle + math.pi; 3736 + var temp: f32 = @abs(angle); 3737 + temp = temp - (2.0 * math.pi * @as(f32, @floatFromInt(@as(i32, @intFromFloat(temp / math.pi))))); 3738 + temp = temp - math.pi; 3739 + if (angle < 0.0) { 3740 + temp = -temp; 3741 + } 3742 + return temp; 3743 + } 3744 + 3745 + pub fn cmulSoa(re0: anytype, im0: anytype, re1: anytype, im1: anytype) [2]@TypeOf(re0, im0, re1, im1) { 3746 + const re0_re1 = re0 * re1; 3747 + const re0_im1 = re0 * im1; 3748 + return .{ 3749 + mulAdd(-im0, im1, re0_re1), // re 3750 + mulAdd(re1, im0, re0_im1), // im 3751 + }; 3752 + } 3753 + // ------------------------------------------------------------------------------ 3754 + // 3755 + // FFT (implementation based on xdsp.h from DirectXMath) 3756 + // 3757 + // ------------------------------------------------------------------------------ 3758 + fn fftButterflyDit4_1(re0: *F32x4, im0: *F32x4) void { 3759 + const re0l = swizzle(re0.*, .x, .x, .y, .y); 3760 + const re0h = swizzle(re0.*, .z, .z, .w, .w); 3761 + 3762 + const im0l = swizzle(im0.*, .x, .x, .y, .y); 3763 + const im0h = swizzle(im0.*, .z, .z, .w, .w); 3764 + 3765 + const re_temp = mulAdd(re0h, f32x4(1.0, -1.0, 1.0, -1.0), re0l); 3766 + const im_temp = mulAdd(im0h, f32x4(1.0, -1.0, 1.0, -1.0), im0l); 3767 + 3768 + const re_shuf0 = @shuffle(f32, re_temp, im_temp, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); 3769 + const re_shuf = swizzle(re_shuf0, .x, .w, .x, .w); 3770 + const im_shuf = swizzle(re_shuf0, .z, .y, .z, .y); 3771 + 3772 + const re_templ = swizzle(re_temp, .x, .y, .x, .y); 3773 + const im_templ = swizzle(im_temp, .x, .y, .x, .y); 3774 + 3775 + re0.* = mulAdd(re_shuf, f32x4(1.0, 1.0, -1.0, -1.0), re_templ); 3776 + im0.* = mulAdd(im_shuf, f32x4(1.0, -1.0, -1.0, 1.0), im_templ); 3777 + } 3778 + 3779 + fn fftButterflyDit4_4( 3780 + re0: *F32x4, 3781 + re1: *F32x4, 3782 + re2: *F32x4, 3783 + re3: *F32x4, 3784 + im0: *F32x4, 3785 + im1: *F32x4, 3786 + im2: *F32x4, 3787 + im3: *F32x4, 3788 + unity_table_re: []const F32x4, 3789 + unity_table_im: []const F32x4, 3790 + stride: u32, 3791 + last: bool, 3792 + ) void { 3793 + const re_temp0 = re0.* + re2.*; 3794 + const im_temp0 = im0.* + im2.*; 3795 + 3796 + const re_temp2 = re1.* + re3.*; 3797 + const im_temp2 = im1.* + im3.*; 3798 + 3799 + const re_temp1 = re0.* - re2.*; 3800 + const im_temp1 = im0.* - im2.*; 3801 + 3802 + const re_temp3 = re1.* - re3.*; 3803 + const im_temp3 = im1.* - im3.*; 3804 + 3805 + var re_temp4 = re_temp0 + re_temp2; 3806 + var im_temp4 = im_temp0 + im_temp2; 3807 + 3808 + var re_temp5 = re_temp1 + im_temp3; 3809 + var im_temp5 = im_temp1 - re_temp3; 3810 + 3811 + var re_temp6 = re_temp0 - re_temp2; 3812 + var im_temp6 = im_temp0 - im_temp2; 3813 + 3814 + var re_temp7 = re_temp1 - im_temp3; 3815 + var im_temp7 = im_temp1 + re_temp3; 3816 + 3817 + { 3818 + const re_im = cmulSoa(re_temp5, im_temp5, unity_table_re[stride], unity_table_im[stride]); 3819 + re_temp5 = re_im[0]; 3820 + im_temp5 = re_im[1]; 3821 + } 3822 + { 3823 + const re_im = cmulSoa(re_temp6, im_temp6, unity_table_re[stride * 2], unity_table_im[stride * 2]); 3824 + re_temp6 = re_im[0]; 3825 + im_temp6 = re_im[1]; 3826 + } 3827 + { 3828 + const re_im = cmulSoa(re_temp7, im_temp7, unity_table_re[stride * 3], unity_table_im[stride * 3]); 3829 + re_temp7 = re_im[0]; 3830 + im_temp7 = re_im[1]; 3831 + } 3832 + 3833 + if (last) { 3834 + fftButterflyDit4_1(&re_temp4, &im_temp4); 3835 + fftButterflyDit4_1(&re_temp5, &im_temp5); 3836 + fftButterflyDit4_1(&re_temp6, &im_temp6); 3837 + fftButterflyDit4_1(&re_temp7, &im_temp7); 3838 + } 3839 + 3840 + re0.* = re_temp4; 3841 + im0.* = im_temp4; 3842 + 3843 + re1.* = re_temp5; 3844 + im1.* = im_temp5; 3845 + 3846 + re2.* = re_temp6; 3847 + im2.* = im_temp6; 3848 + 3849 + re3.* = re_temp7; 3850 + im3.* = im_temp7; 3851 + } 3852 + 3853 + fn fft4(re: []F32x4, im: []F32x4, count: u32) void { 3854 + assert(std.math.isPowerOfTwo(count)); 3855 + assert(re.len >= count); 3856 + assert(im.len >= count); 3857 + 3858 + var index: u32 = 0; 3859 + while (index < count) : (index += 1) { 3860 + fftButterflyDit4_1(&re[index], &im[index]); 3861 + } 3862 + } 3863 + test "zmath.fft4" { 3864 + const epsilon = 0.0001; 3865 + var re = [_]F32x4{f32x4(1.0, 2.0, 3.0, 4.0)}; 3866 + var im = [_]F32x4{f32x4s(0.0)}; 3867 + fft4(re[0..], im[0..], 1); 3868 + 3869 + var re_uns: [1]F32x4 = undefined; 3870 + var im_uns: [1]F32x4 = undefined; 3871 + fftUnswizzle(re[0..], re_uns[0..]); 3872 + fftUnswizzle(im[0..], im_uns[0..]); 3873 + 3874 + try expect(approxEqAbs(re_uns[0], f32x4(10.0, -2.0, -2.0, -2.0), epsilon)); 3875 + try expect(approxEqAbs(im_uns[0], f32x4(0.0, 2.0, 0.0, -2.0), epsilon)); 3876 + } 3877 + 3878 + fn fft8(re: []F32x4, im: []F32x4, count: u32) void { 3879 + assert(std.math.isPowerOfTwo(count)); 3880 + assert(re.len >= 2 * count); 3881 + assert(im.len >= 2 * count); 3882 + 3883 + var index: u32 = 0; 3884 + while (index < count) : (index += 1) { 3885 + var pre = re[index * 2 ..]; 3886 + var pim = im[index * 2 ..]; 3887 + 3888 + var odds_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); 3889 + var evens_re = @shuffle(f32, pre[0], pre[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); 3890 + var odds_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); 3891 + var evens_im = @shuffle(f32, pim[0], pim[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); 3892 + fftButterflyDit4_1(&odds_re, &odds_im); 3893 + fftButterflyDit4_1(&evens_re, &evens_im); 3894 + 3895 + { 3896 + const re_im = cmulSoa( 3897 + odds_re, 3898 + odds_im, 3899 + f32x4(1.0, 0.70710677, 0.0, -0.70710677), 3900 + f32x4(0.0, -0.70710677, -1.0, -0.70710677), 3901 + ); 3902 + pre[0] = evens_re + re_im[0]; 3903 + pim[0] = evens_im + re_im[1]; 3904 + } 3905 + { 3906 + const re_im = cmulSoa( 3907 + odds_re, 3908 + odds_im, 3909 + f32x4(-1.0, -0.70710677, 0.0, 0.70710677), 3910 + f32x4(0.0, 0.70710677, 1.0, 0.70710677), 3911 + ); 3912 + pre[1] = evens_re + re_im[0]; 3913 + pim[1] = evens_im + re_im[1]; 3914 + } 3915 + } 3916 + } 3917 + test "zmath.fft8" { 3918 + const epsilon = 0.0001; 3919 + var re = [_]F32x4{ f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0) }; 3920 + var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0) }; 3921 + fft8(re[0..], im[0..], 1); 3922 + 3923 + var re_uns: [2]F32x4 = undefined; 3924 + var im_uns: [2]F32x4 = undefined; 3925 + fftUnswizzle(re[0..], re_uns[0..]); 3926 + fftUnswizzle(im[0..], im_uns[0..]); 3927 + 3928 + try expect(approxEqAbs(re_uns[0], f32x4(36.0, -4.0, -4.0, -4.0), epsilon)); 3929 + try expect(approxEqAbs(re_uns[1], f32x4(-4.0, -4.0, -4.0, -4.0), epsilon)); 3930 + try expect(approxEqAbs(im_uns[0], f32x4(0.0, 9.656854, 4.0, 1.656854), epsilon)); 3931 + try expect(approxEqAbs(im_uns[1], f32x4(0.0, -1.656854, -4.0, -9.656854), epsilon)); 3932 + } 3933 + 3934 + fn fft16(re: []F32x4, im: []F32x4, count: u32) void { 3935 + assert(std.math.isPowerOfTwo(count)); 3936 + assert(re.len >= 4 * count); 3937 + assert(im.len >= 4 * count); 3938 + 3939 + const static = struct { 3940 + const unity_table_re = [4]F32x4{ 3941 + f32x4(1.0, 1.0, 1.0, 1.0), 3942 + f32x4(1.0, 0.92387950, 0.70710677, 0.38268343), 3943 + f32x4(1.0, 0.70710677, -4.3711388e-008, -0.70710677), 3944 + f32x4(1.0, 0.38268343, -0.70710677, -0.92387950), 3945 + }; 3946 + const unity_table_im = [4]F32x4{ 3947 + f32x4(-0.0, -0.0, -0.0, -0.0), 3948 + f32x4(-0.0, -0.38268343, -0.70710677, -0.92387950), 3949 + f32x4(-0.0, -0.70710677, -1.0, -0.70710677), 3950 + f32x4(-0.0, -0.92387950, -0.70710677, 0.38268343), 3951 + }; 3952 + }; 3953 + 3954 + var index: u32 = 0; 3955 + while (index < count) : (index += 1) { 3956 + fftButterflyDit4_4( 3957 + &re[index * 4], 3958 + &re[index * 4 + 1], 3959 + &re[index * 4 + 2], 3960 + &re[index * 4 + 3], 3961 + &im[index * 4], 3962 + &im[index * 4 + 1], 3963 + &im[index * 4 + 2], 3964 + &im[index * 4 + 3], 3965 + static.unity_table_re[0..], 3966 + static.unity_table_im[0..], 3967 + 1, 3968 + true, 3969 + ); 3970 + } 3971 + } 3972 + test "zmath.fft16" { 3973 + const epsilon = 0.0001; 3974 + var re = [_]F32x4{ 3975 + f32x4(1.0, 2.0, 3.0, 4.0), 3976 + f32x4(5.0, 6.0, 7.0, 8.0), 3977 + f32x4(9.0, 10.0, 11.0, 12.0), 3978 + f32x4(13.0, 14.0, 15.0, 16.0), 3979 + }; 3980 + var im = [_]F32x4{ f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0) }; 3981 + fft16(re[0..], im[0..], 1); 3982 + 3983 + var re_uns: [4]F32x4 = undefined; 3984 + var im_uns: [4]F32x4 = undefined; 3985 + fftUnswizzle(re[0..], re_uns[0..]); 3986 + fftUnswizzle(im[0..], im_uns[0..]); 3987 + 3988 + try expect(approxEqAbs(re_uns[0], f32x4(136.0, -8.0, -8.0, -8.0), epsilon)); 3989 + try expect(approxEqAbs(re_uns[1], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon)); 3990 + try expect(approxEqAbs(re_uns[2], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon)); 3991 + try expect(approxEqAbs(re_uns[3], f32x4(-8.0, -8.0, -8.0, -8.0), epsilon)); 3992 + try expect(approxEqAbs(im_uns[0], f32x4(0.0, 40.218716, 19.313708, 11.972846), epsilon)); 3993 + try expect(approxEqAbs(im_uns[1], f32x4(8.0, 5.345429, 3.313708, 1.591299), epsilon)); 3994 + try expect(approxEqAbs(im_uns[2], f32x4(0.0, -1.591299, -3.313708, -5.345429), epsilon)); 3995 + try expect(approxEqAbs(im_uns[3], f32x4(-8.0, -11.972846, -19.313708, -40.218716), epsilon)); 3996 + } 3997 + 3998 + fn fftN(re: []F32x4, im: []F32x4, unity_table: []const F32x4, length: u32, count: u32) void { 3999 + assert(length > 16); 4000 + assert(std.math.isPowerOfTwo(length)); 4001 + assert(std.math.isPowerOfTwo(count)); 4002 + assert(re.len >= length * count / 4); 4003 + assert(re.len == im.len); 4004 + 4005 + const total = count * length; 4006 + const total_vectors = total / 4; 4007 + const stage_vectors = length / 4; 4008 + const stage_vectors_mask = stage_vectors - 1; 4009 + const stride = length / 16; 4010 + const stride_mask = stride - 1; 4011 + const stride_inv_mask = ~stride_mask; 4012 + 4013 + var unity_table_re = unity_table; 4014 + var unity_table_im = unity_table[length / 4 ..]; 4015 + 4016 + var index: u32 = 0; 4017 + while (index < total_vectors / 4) : (index += 1) { 4018 + const n = (index & stride_inv_mask) * 4 + (index & stride_mask); 4019 + fftButterflyDit4_4( 4020 + &re[n], 4021 + &re[n + stride], 4022 + &re[n + stride * 2], 4023 + &re[n + stride * 3], 4024 + &im[n], 4025 + &im[n + stride], 4026 + &im[n + stride * 2], 4027 + &im[n + stride * 3], 4028 + unity_table_re[(n & stage_vectors_mask)..], 4029 + unity_table_im[(n & stage_vectors_mask)..], 4030 + stride, 4031 + false, 4032 + ); 4033 + } 4034 + 4035 + if (length > 16 * 4) { 4036 + fftN(re, im, unity_table[(length / 2)..], length / 4, count * 4); 4037 + } else if (length == 16 * 4) { 4038 + fft16(re, im, count * 4); 4039 + } else if (length == 8 * 4) { 4040 + fft8(re, im, count * 4); 4041 + } else if (length == 4 * 4) { 4042 + fft4(re, im, count * 4); 4043 + } 4044 + } 4045 + test "zmath.fftN" { 4046 + var unity_table: [128]F32x4 = undefined; 4047 + const epsilon = 0.0001; 4048 + 4049 + // 32 samples 4050 + { 4051 + var re = [_]F32x4{ 4052 + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4053 + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4054 + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4055 + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4056 + }; 4057 + var im = [_]F32x4{ 4058 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4059 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4060 + }; 4061 + 4062 + fftInitUnityTable(unity_table[0..32]); 4063 + fft(re[0..], im[0..], unity_table[0..32]); 4064 + 4065 + try expect(approxEqAbs(re[0], f32x4(528.0, -16.0, -16.0, -16.0), epsilon)); 4066 + try expect(approxEqAbs(re[1], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon)); 4067 + try expect(approxEqAbs(re[2], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon)); 4068 + try expect(approxEqAbs(re[3], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon)); 4069 + try expect(approxEqAbs(re[4], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon)); 4070 + try expect(approxEqAbs(re[5], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon)); 4071 + try expect(approxEqAbs(re[6], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon)); 4072 + try expect(approxEqAbs(re[7], f32x4(-16.0, -16.0, -16.0, -16.0), epsilon)); 4073 + try expect(approxEqAbs(im[0], f32x4(0.0, 162.450726, 80.437432, 52.744931), epsilon)); 4074 + try expect(approxEqAbs(im[1], f32x4(38.627417, 29.933895, 23.945692, 19.496056), epsilon)); 4075 + try expect(approxEqAbs(im[2], f32x4(16.0, 13.130861, 10.690858, 8.552178), epsilon)); 4076 + try expect(approxEqAbs(im[3], f32x4(6.627417, 4.853547, 3.182598, 1.575862), epsilon)); 4077 + try expect(approxEqAbs(im[4], f32x4(0.0, -1.575862, -3.182598, -4.853547), epsilon)); 4078 + try expect(approxEqAbs(im[5], f32x4(-6.627417, -8.552178, -10.690858, -13.130861), epsilon)); 4079 + try expect(approxEqAbs(im[6], f32x4(-16.0, -19.496056, -23.945692, -29.933895), epsilon)); 4080 + try expect(approxEqAbs(im[7], f32x4(-38.627417, -52.744931, -80.437432, -162.450726), epsilon)); 4081 + } 4082 + 4083 + // 64 samples 4084 + { 4085 + var re = [_]F32x4{ 4086 + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4087 + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4088 + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4089 + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4090 + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4091 + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4092 + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4093 + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4094 + }; 4095 + var im = [_]F32x4{ 4096 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4097 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4098 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4099 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4100 + }; 4101 + 4102 + fftInitUnityTable(unity_table[0..64]); 4103 + fft(re[0..], im[0..], unity_table[0..64]); 4104 + 4105 + try expect(approxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon)); 4106 + var i: u32 = 1; 4107 + while (i < 16) : (i += 1) { 4108 + try expect(approxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon)); 4109 + } 4110 + 4111 + const expected = [_]f32{ 4112 + 0.0, 0.0, 324.901452, 0.000000, 160.874864, 0.0, 105.489863, 0.000000, 4113 + 77.254834, 0.0, 59.867789, 0.0, 47.891384, 0.0, 38.992113, 0.0, 4114 + 32.000000, 0.000000, 26.261721, 0.000000, 21.381716, 0.000000, 17.104356, 0.000000, 4115 + 13.254834, 0.000000, 9.707094, 0.000000, 6.365196, 0.000000, 3.151725, 0.000000, 4116 + 0.000000, 0.000000, -3.151725, 0.000000, -6.365196, 0.000000, -9.707094, 0.000000, 4117 + -13.254834, 0.000000, -17.104356, 0.000000, -21.381716, 0.000000, -26.261721, 0.000000, 4118 + -32.000000, 0.000000, -38.992113, 0.000000, -47.891384, 0.000000, -59.867789, 0.000000, 4119 + -77.254834, 0.000000, -105.489863, 0.000000, -160.874864, 0.000000, -324.901452, 0.000000, 4120 + }; 4121 + for (expected, 0..) |e, ie| { 4122 + try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); 4123 + } 4124 + } 4125 + 4126 + // 128 samples 4127 + { 4128 + var re = [_]F32x4{ 4129 + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4130 + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4131 + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4132 + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4133 + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4134 + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4135 + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4136 + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4137 + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4138 + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4139 + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4140 + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4141 + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4142 + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4143 + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4144 + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4145 + }; 4146 + var im = [_]F32x4{ 4147 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4148 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4149 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4150 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4151 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4152 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4153 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4154 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4155 + }; 4156 + 4157 + fftInitUnityTable(unity_table[0..128]); 4158 + fft(re[0..], im[0..], unity_table[0..128]); 4159 + 4160 + try expect(approxEqAbs(re[0], f32x4(2112.0, 0.0, 0.0, 0.0), epsilon)); 4161 + var i: u32 = 1; 4162 + while (i < 32) : (i += 1) { 4163 + try expect(approxEqAbs(re[i], f32x4(-64.0, 0.0, 0.0, 0.0), epsilon)); 4164 + } 4165 + 4166 + const expected = [_]f32{ 4167 + 0.000000, 0.000000, 0.000000, 0.000000, 649.802905, 0.000000, 0.000000, 0.000000, 4168 + 321.749727, 0.000000, 0.000000, 0.000000, 210.979725, 0.000000, 0.000000, 0.000000, 4169 + 154.509668, 0.000000, 0.000000, 0.000000, 119.735578, 0.000000, 0.000000, 0.000000, 4170 + 95.782769, 0.000000, 0.000000, 0.000000, 77.984226, 0.000000, 0.000000, 0.000000, 4171 + 64.000000, 0.000000, 0.000000, 0.000000, 52.523443, 0.000000, 0.000000, 0.000000, 4172 + 42.763433, 0.000000, 0.000000, 0.000000, 34.208713, 0.000000, 0.000000, 0.000000, 4173 + 26.509668, 0.000000, 0.000000, 0.000000, 19.414188, 0.000000, 0.000000, 0.000000, 4174 + 12.730392, 0.000000, 0.000000, 0.000000, 6.303450, 0.000000, 0.000000, 0.000000, 4175 + 0.000000, 0.000000, 0.000000, 0.000000, -6.303450, 0.000000, 0.000000, 0.000000, 4176 + -12.730392, 0.000000, 0.000000, 0.000000, -19.414188, 0.000000, 0.000000, 0.000000, 4177 + -26.509668, 0.000000, 0.000000, 0.000000, -34.208713, 0.000000, 0.000000, 0.000000, 4178 + -42.763433, 0.000000, 0.000000, 0.000000, -52.523443, 0.000000, 0.000000, 0.000000, 4179 + -64.000000, 0.000000, 0.000000, 0.000000, -77.984226, 0.000000, 0.000000, 0.000000, 4180 + -95.782769, 0.000000, 0.000000, 0.000000, -119.735578, 0.000000, 0.000000, 0.000000, 4181 + -154.509668, 0.000000, 0.000000, 0.000000, -210.979725, 0.000000, 0.000000, 0.000000, 4182 + -321.749727, 0.000000, 0.000000, 0.000000, -649.802905, 0.000000, 0.000000, 0.000000, 4183 + }; 4184 + for (expected, 0..) |e, ie| { 4185 + try expect(std.math.approxEqAbs(f32, e, im[(ie / 4)][ie % 4], epsilon)); 4186 + } 4187 + } 4188 + } 4189 + 4190 + fn fftUnswizzle(input: []const F32x4, output: []F32x4) void { 4191 + assert(std.math.isPowerOfTwo(input.len)); 4192 + assert(input.len == output.len); 4193 + assert(input.ptr != output.ptr); 4194 + 4195 + const log2_length = std.math.log2_int(usize, input.len * 4); 4196 + assert(log2_length >= 2); 4197 + 4198 + const length = input.len; 4199 + 4200 + const f32_output = @as([*]f32, @ptrCast(output.ptr))[0 .. output.len * 4]; 4201 + 4202 + const static = struct { 4203 + const swizzle_table = [256]u8{ 4204 + 0x00, 0x40, 0x80, 0xC0, 0x10, 0x50, 0x90, 0xD0, 0x20, 0x60, 0xA0, 0xE0, 0x30, 0x70, 0xB0, 0xF0, 4205 + 0x04, 0x44, 0x84, 0xC4, 0x14, 0x54, 0x94, 0xD4, 0x24, 0x64, 0xA4, 0xE4, 0x34, 0x74, 0xB4, 0xF4, 4206 + 0x08, 0x48, 0x88, 0xC8, 0x18, 0x58, 0x98, 0xD8, 0x28, 0x68, 0xA8, 0xE8, 0x38, 0x78, 0xB8, 0xF8, 4207 + 0x0C, 0x4C, 0x8C, 0xCC, 0x1C, 0x5C, 0x9C, 0xDC, 0x2C, 0x6C, 0xAC, 0xEC, 0x3C, 0x7C, 0xBC, 0xFC, 4208 + 0x01, 0x41, 0x81, 0xC1, 0x11, 0x51, 0x91, 0xD1, 0x21, 0x61, 0xA1, 0xE1, 0x31, 0x71, 0xB1, 0xF1, 4209 + 0x05, 0x45, 0x85, 0xC5, 0x15, 0x55, 0x95, 0xD5, 0x25, 0x65, 0xA5, 0xE5, 0x35, 0x75, 0xB5, 0xF5, 4210 + 0x09, 0x49, 0x89, 0xC9, 0x19, 0x59, 0x99, 0xD9, 0x29, 0x69, 0xA9, 0xE9, 0x39, 0x79, 0xB9, 0xF9, 4211 + 0x0D, 0x4D, 0x8D, 0xCD, 0x1D, 0x5D, 0x9D, 0xDD, 0x2D, 0x6D, 0xAD, 0xED, 0x3D, 0x7D, 0xBD, 0xFD, 4212 + 0x02, 0x42, 0x82, 0xC2, 0x12, 0x52, 0x92, 0xD2, 0x22, 0x62, 0xA2, 0xE2, 0x32, 0x72, 0xB2, 0xF2, 4213 + 0x06, 0x46, 0x86, 0xC6, 0x16, 0x56, 0x96, 0xD6, 0x26, 0x66, 0xA6, 0xE6, 0x36, 0x76, 0xB6, 0xF6, 4214 + 0x0A, 0x4A, 0x8A, 0xCA, 0x1A, 0x5A, 0x9A, 0xDA, 0x2A, 0x6A, 0xAA, 0xEA, 0x3A, 0x7A, 0xBA, 0xFA, 4215 + 0x0E, 0x4E, 0x8E, 0xCE, 0x1E, 0x5E, 0x9E, 0xDE, 0x2E, 0x6E, 0xAE, 0xEE, 0x3E, 0x7E, 0xBE, 0xFE, 4216 + 0x03, 0x43, 0x83, 0xC3, 0x13, 0x53, 0x93, 0xD3, 0x23, 0x63, 0xA3, 0xE3, 0x33, 0x73, 0xB3, 0xF3, 4217 + 0x07, 0x47, 0x87, 0xC7, 0x17, 0x57, 0x97, 0xD7, 0x27, 0x67, 0xA7, 0xE7, 0x37, 0x77, 0xB7, 0xF7, 4218 + 0x0B, 0x4B, 0x8B, 0xCB, 0x1B, 0x5B, 0x9B, 0xDB, 0x2B, 0x6B, 0xAB, 0xEB, 0x3B, 0x7B, 0xBB, 0xFB, 4219 + 0x0F, 0x4F, 0x8F, 0xCF, 0x1F, 0x5F, 0x9F, 0xDF, 0x2F, 0x6F, 0xAF, 0xEF, 0x3F, 0x7F, 0xBF, 0xFF, 4220 + }; 4221 + }; 4222 + 4223 + if ((log2_length & 1) == 0) { 4224 + const rev32 = @as(u6, @intCast(32 - log2_length)); 4225 + var index: usize = 0; 4226 + while (index < length) : (index += 1) { 4227 + const n = index * 4; 4228 + const addr = 4229 + (@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) | 4230 + (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) | 4231 + (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) | 4232 + @as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff])); 4233 + f32_output[addr >> rev32] = input[index][0]; 4234 + f32_output[(0x40000000 | addr) >> rev32] = input[index][1]; 4235 + f32_output[(0x80000000 | addr) >> rev32] = input[index][2]; 4236 + f32_output[(0xC0000000 | addr) >> rev32] = input[index][3]; 4237 + } 4238 + } else { 4239 + const rev7 = @as(usize, 1) << @as(u6, @intCast(log2_length - 3)); 4240 + const rev32 = @as(u6, @intCast(32 - (log2_length - 3))); 4241 + var index: usize = 0; 4242 + while (index < length) : (index += 1) { 4243 + const n = index / 2; 4244 + var addr = 4245 + (((@as(usize, @intCast(static.swizzle_table[n & 0xff])) << 24) | 4246 + (@as(usize, @intCast(static.swizzle_table[(n >> 8) & 0xff])) << 16) | 4247 + (@as(usize, @intCast(static.swizzle_table[(n >> 16) & 0xff])) << 8) | 4248 + (@as(usize, @intCast(static.swizzle_table[(n >> 24) & 0xff])))) >> rev32) | 4249 + ((index & 1) * rev7 * 4); 4250 + f32_output[addr] = input[index][0]; 4251 + addr += rev7; 4252 + f32_output[addr] = input[index][1]; 4253 + addr += rev7; 4254 + f32_output[addr] = input[index][2]; 4255 + addr += rev7; 4256 + f32_output[addr] = input[index][3]; 4257 + } 4258 + } 4259 + } 4260 + 4261 + pub fn fftInitUnityTable(out_unity_table: []F32x4) void { 4262 + assert(std.math.isPowerOfTwo(out_unity_table.len)); 4263 + assert(out_unity_table.len >= 32 and out_unity_table.len <= 512); 4264 + 4265 + var unity_table = out_unity_table; 4266 + 4267 + const v0123 = f32x4(0.0, 1.0, 2.0, 3.0); 4268 + var length = out_unity_table.len / 4; 4269 + var vlstep = f32x4s(0.5 * math.pi / @as(f32, @floatFromInt(length))); 4270 + 4271 + while (true) { 4272 + length /= 4; 4273 + var vjp = v0123; 4274 + 4275 + var j: u32 = 0; 4276 + while (j < length) : (j += 1) { 4277 + unity_table[j] = f32x4s(1.0); 4278 + unity_table[j + length * 4] = f32x4s(0.0); 4279 + 4280 + var vls = vjp * vlstep; 4281 + var sin_cos = sincos(vls); 4282 + unity_table[j + length] = sin_cos[1]; 4283 + unity_table[j + length * 5] = sin_cos[0] * f32x4s(-1.0); 4284 + 4285 + var vijp = vjp + vjp; 4286 + vls = vijp * vlstep; 4287 + sin_cos = sincos(vls); 4288 + unity_table[j + length * 2] = sin_cos[1]; 4289 + unity_table[j + length * 6] = sin_cos[0] * f32x4s(-1.0); 4290 + 4291 + vijp = vijp + vjp; 4292 + vls = vijp * vlstep; 4293 + sin_cos = sincos(vls); 4294 + unity_table[j + length * 3] = sin_cos[1]; 4295 + unity_table[j + length * 7] = sin_cos[0] * f32x4s(-1.0); 4296 + 4297 + vjp += f32x4s(4.0); 4298 + } 4299 + vlstep *= f32x4s(4.0); 4300 + unity_table = unity_table[8 * length ..]; 4301 + 4302 + if (length <= 4) 4303 + break; 4304 + } 4305 + } 4306 + 4307 + pub fn fft(re: []F32x4, im: []F32x4, unity_table: []const F32x4) void { 4308 + const length = @as(u32, @intCast(re.len * 4)); 4309 + assert(std.math.isPowerOfTwo(length)); 4310 + assert(length >= 4 and length <= 512); 4311 + assert(re.len == im.len); 4312 + 4313 + var re_temp_storage: [128]F32x4 = undefined; 4314 + var im_temp_storage: [128]F32x4 = undefined; 4315 + const re_temp = re_temp_storage[0..re.len]; 4316 + const im_temp = im_temp_storage[0..im.len]; 4317 + 4318 + @memcpy(re_temp, re); 4319 + @memcpy(im_temp, im); 4320 + 4321 + if (length > 16) { 4322 + assert(unity_table.len == length); 4323 + fftN(re_temp, im_temp, unity_table, length, 1); 4324 + } else if (length == 16) { 4325 + fft16(re_temp, im_temp, 1); 4326 + } else if (length == 8) { 4327 + fft8(re_temp, im_temp, 1); 4328 + } else if (length == 4) { 4329 + fft4(re_temp, im_temp, 1); 4330 + } 4331 + 4332 + fftUnswizzle(re_temp, re); 4333 + fftUnswizzle(im_temp, im); 4334 + } 4335 + 4336 + pub fn ifft(re: []F32x4, im: []const F32x4, unity_table: []const F32x4) void { 4337 + const length = @as(u32, @intCast(re.len * 4)); 4338 + assert(std.math.isPowerOfTwo(length)); 4339 + assert(length >= 4 and length <= 512); 4340 + assert(re.len == im.len); 4341 + 4342 + var re_temp_storage: [128]F32x4 = undefined; 4343 + var im_temp_storage: [128]F32x4 = undefined; 4344 + var re_temp = re_temp_storage[0..re.len]; 4345 + var im_temp = im_temp_storage[0..im.len]; 4346 + 4347 + const rnp = f32x4s(1.0 / @as(f32, @floatFromInt(length))); 4348 + const rnm = f32x4s(-1.0 / @as(f32, @floatFromInt(length))); 4349 + 4350 + for (re, 0..) |_, i| { 4351 + re_temp[i] = re[i] * rnp; 4352 + im_temp[i] = im[i] * rnm; 4353 + } 4354 + 4355 + if (length > 16) { 4356 + assert(unity_table.len == length); 4357 + fftN(re_temp, im_temp, unity_table, length, 1); 4358 + } else if (length == 16) { 4359 + fft16(re_temp, im_temp, 1); 4360 + } else if (length == 8) { 4361 + fft8(re_temp, im_temp, 1); 4362 + } else if (length == 4) { 4363 + fft4(re_temp, im_temp, 1); 4364 + } 4365 + 4366 + fftUnswizzle(re_temp, re); 4367 + } 4368 + test "zmath.ifft" { 4369 + var unity_table: [512]F32x4 = undefined; 4370 + const epsilon = 0.0001; 4371 + 4372 + // 64 samples 4373 + { 4374 + var re = [_]F32x4{ 4375 + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4376 + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4377 + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4378 + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4379 + f32x4(1.0, 2.0, 3.0, 4.0), f32x4(5.0, 6.0, 7.0, 8.0), 4380 + f32x4(9.0, 10.0, 11.0, 12.0), f32x4(13.0, 14.0, 15.0, 16.0), 4381 + f32x4(17.0, 18.0, 19.0, 20.0), f32x4(21.0, 22.0, 23.0, 24.0), 4382 + f32x4(25.0, 26.0, 27.0, 28.0), f32x4(29.0, 30.0, 31.0, 32.0), 4383 + }; 4384 + var im = [_]F32x4{ 4385 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4386 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4387 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4388 + f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), f32x4s(0.0), 4389 + }; 4390 + 4391 + fftInitUnityTable(unity_table[0..64]); 4392 + fft(re[0..], im[0..], unity_table[0..64]); 4393 + 4394 + try expect(approxEqAbs(re[0], f32x4(1056.0, 0.0, -32.0, 0.0), epsilon)); 4395 + var i: u32 = 1; 4396 + while (i < 16) : (i += 1) { 4397 + try expect(approxEqAbs(re[i], f32x4(-32.0, 0.0, -32.0, 0.0), epsilon)); 4398 + } 4399 + 4400 + ifft(re[0..], im[0..], unity_table[0..64]); 4401 + 4402 + try expect(approxEqAbs(re[0], f32x4(1.0, 2.0, 3.0, 4.0), epsilon)); 4403 + try expect(approxEqAbs(re[1], f32x4(5.0, 6.0, 7.0, 8.0), epsilon)); 4404 + try expect(approxEqAbs(re[2], f32x4(9.0, 10.0, 11.0, 12.0), epsilon)); 4405 + try expect(approxEqAbs(re[3], f32x4(13.0, 14.0, 15.0, 16.0), epsilon)); 4406 + try expect(approxEqAbs(re[4], f32x4(17.0, 18.0, 19.0, 20.0), epsilon)); 4407 + try expect(approxEqAbs(re[5], f32x4(21.0, 22.0, 23.0, 24.0), epsilon)); 4408 + try expect(approxEqAbs(re[6], f32x4(25.0, 26.0, 27.0, 28.0), epsilon)); 4409 + try expect(approxEqAbs(re[7], f32x4(29.0, 30.0, 31.0, 32.0), epsilon)); 4410 + } 4411 + 4412 + // 512 samples 4413 + { 4414 + var re: [128]F32x4 = undefined; 4415 + var im = [_]F32x4{f32x4s(0.0)} ** 128; 4416 + 4417 + for (&re, 0..) |*v, i| { 4418 + const f = @as(f32, @floatFromInt(i * 4)); 4419 + v.* = f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0); 4420 + } 4421 + 4422 + fftInitUnityTable(unity_table[0..512]); 4423 + fft(re[0..], im[0..], unity_table[0..512]); 4424 + 4425 + for (re, 0..) |v, i| { 4426 + const f = @as(f32, @floatFromInt(i * 4)); 4427 + try expect(!approxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon)); 4428 + } 4429 + 4430 + ifft(re[0..], im[0..], unity_table[0..512]); 4431 + 4432 + for (re, 0..) |v, i| { 4433 + const f = @as(f32, @floatFromInt(i * 4)); 4434 + try expect(approxEqAbs(v, f32x4(f + 1.0, f + 2.0, f + 3.0, f + 4.0), epsilon)); 4435 + } 4436 + } 4437 + } 4438 + // ------------------------------------------------------------------------------ 4439 + // 4440 + // Private functions and constants 4441 + // 4442 + // ------------------------------------------------------------------------------ 4443 + const f32x4_sign_mask1: F32x4 = F32x4{ @as(f32, @bitCast(@as(u32, 0x8000_0000))), 0, 0, 0 }; 4444 + const f32x4_mask2: F32x4 = F32x4{ 4445 + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), 4446 + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), 4447 + 0, 4448 + 0, 4449 + }; 4450 + const f32x4_mask3: F32x4 = F32x4{ 4451 + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), 4452 + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), 4453 + @as(f32, @bitCast(@as(u32, 0xffff_ffff))), 4454 + 0, 4455 + }; 4456 + 4457 + inline fn splatNegativeZero(comptime T: type) T { 4458 + return @splat(@as(f32, @bitCast(@as(u32, 0x8000_0000)))); 4459 + } 4460 + inline fn splatNoFraction(comptime T: type) T { 4461 + return @splat(@as(f32, 8_388_608.0)); 4462 + } 4463 + inline fn splatAbsMask(comptime T: type) T { 4464 + return @splat(@as(f32, @bitCast(@as(u32, 0x7fff_ffff)))); 4465 + } 4466 + 4467 + fn floatToIntAndBack(v: anytype) @TypeOf(v) { 4468 + // This routine won't handle nan, inf and numbers greater than 8_388_608.0 (will generate undefined values). 4469 + @setRuntimeSafety(false); 4470 + 4471 + const T = @TypeOf(v); 4472 + const len = veclen(T); 4473 + 4474 + var vi32: [len]i32 = undefined; 4475 + comptime var i: u32 = 0; 4476 + // vcvttps2dq 4477 + inline while (i < len) : (i += 1) { 4478 + vi32[i] = @as(i32, @intFromFloat(v[i])); 4479 + } 4480 + 4481 + var vf32: [len]f32 = undefined; 4482 + i = 0; 4483 + // vcvtdq2ps 4484 + inline while (i < len) : (i += 1) { 4485 + vf32[i] = @as(f32, @floatFromInt(vi32[i])); 4486 + } 4487 + 4488 + return vf32; 4489 + } 4490 + test "zmath.floatToIntAndBack" { 4491 + { 4492 + const v = floatToIntAndBack(f32x4(1.1, 2.9, 3.0, -4.5)); 4493 + try expect(approxEqAbs(v, f32x4(1.0, 2.0, 3.0, -4.0), 0.0)); 4494 + } 4495 + { 4496 + const v = floatToIntAndBack(f32x8(1.1, 2.9, 3.0, -4.5, 2.5, -2.5, 1.1, -100.2)); 4497 + try expect(approxEqAbs(v, f32x8(1.0, 2.0, 3.0, -4.0, 2.0, -2.0, 1.0, -100.0), 0.0)); 4498 + } 4499 + { 4500 + const v = floatToIntAndBack(f32x4(math.inf(f32), 2.9, math.nan(f32), math.snan(f32))); 4501 + try expect(v[1] == 2.0); 4502 + } 4503 + } 4504 + 4505 + pub fn approxEqAbs(v0: anytype, v1: anytype, eps: f32) bool { 4506 + const T = @TypeOf(v0, v1); 4507 + comptime var i: comptime_int = 0; 4508 + inline while (i < veclen(T)) : (i += 1) { 4509 + if (!math.approxEqAbs(f32, v0[i], v1[i], eps)) { 4510 + return false; 4511 + } 4512 + } 4513 + return true; 4514 + } 4515 + 4516 + // ------------------------------------------------------------------------------ 4517 + // This software is available under 2 licenses -- choose whichever you prefer. 4518 + // ------------------------------------------------------------------------------ 4519 + // ALTERNATIVE A - MIT License 4520 + // Copyright (c) 2022 Michal Ziulek and Contributors 4521 + // Permission is hereby granted, free of charge, to any person obtaining a copy of 4522 + // this software and associated documentation files (the "Software"), to deal in 4523 + // the Software without restriction, including without limitation the rights to 4524 + // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 4525 + // of the Software, and to permit persons to whom the Software is furnished to do 4526 + // so, subject to the following conditions: 4527 + // The above copyright notice and this permission notice shall be included in all 4528 + // copies or substantial portions of the Software. 4529 + // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 4530 + // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 4531 + // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 4532 + // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 4533 + // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 4534 + // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 4535 + // SOFTWARE. 4536 + // ------------------------------------------------------------------------------ 4537 + // ALTERNATIVE B - Public Domain (www.unlicense.org) 4538 + // This is free and unencumbered software released into the public domain. 4539 + // Anyone is free to copy, modify, publish, use, compile, sell, or distribute this 4540 + // software, either in source code form or as a compiled binary, for any purpose, 4541 + // commercial or non-commercial, and by any means. 4542 + // In jurisdictions that recognize copyright laws, the author or authors of this 4543 + // software dedicate any and all copyright interest in the software to the public 4544 + // domain. We make this dedication for the benefit of the public at large and to 4545 + // the detriment of our heirs and successors. We intend this dedication to be an 4546 + // overt act of relinquishment in perpetuity of all present and future rights to 4547 + // this software under copyright law. 4548 + // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 4549 + // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 4550 + // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 4551 + // AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 4552 + // ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 4553 + // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 4554 + // ------------------------------------------------------------------------------