The game where you go into mines and start crafting! But for consoles (forked directly from smartcmd's GitHub).
at main 1962 lines 67 kB view raw
1//------------------------------------------------------------------------------------- 2// DirectXMathConvert.inl -- SIMD C++ Math library 3// 4// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF 5// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO 6// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A 7// PARTICULAR PURPOSE. 8// 9// Copyright (c) Microsoft Corporation. All rights reserved. 10//------------------------------------------------------------------------------------- 11 12#ifdef _MSC_VER 13#pragma once 14#endif 15 16/**************************************************************************** 17 * 18 * Data conversion 19 * 20 ****************************************************************************/ 21 22//------------------------------------------------------------------------------ 23 24#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 25// For VMX128, these routines are all defines in the main header 26 27#pragma warning(push) 28#pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized 29 30inline XMVECTOR XMConvertVectorIntToFloat 31( 32 FXMVECTOR VInt, 33 uint32_t DivExponent 34) 35{ 36 assert(DivExponent<32); 37#if defined(_XM_NO_INTRINSICS_) 38 float fScale = 1.0f / (float)(1U << DivExponent); 39 uint32_t ElementIndex = 0; 40 XMVECTOR Result; 41 do { 42 int32_t iTemp = (int32_t)VInt.vector4_u32[ElementIndex]; 43 Result.vector4_f32[ElementIndex] = ((float)iTemp) * fScale; 44 } while (++ElementIndex<4); 45 return Result; 46#elif defined(_XM_ARM_NEON_INTRINSICS_) 47 __n128 vResult = vcvtq_f32_s32( VInt ); 48 uint32_t uScale = 0x3F800000U - (DivExponent << 23); 49 __n128 vScale = vdupq_n_u32( uScale ); 50 return vmulq_f32( vResult, vScale ); 51#else // _XM_SSE_INTRINSICS_ 52 // Convert to floats 53 XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt)); 54 // Convert DivExponent into 
1.0f/(1<<DivExponent) 55 uint32_t uScale = 0x3F800000U - (DivExponent << 23); 56 // Splat the scalar value 57 __m128i vScale = _mm_set1_epi32(uScale); 58 vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale)); 59 return vResult; 60#endif 61} 62 63//------------------------------------------------------------------------------ 64 65inline XMVECTOR XMConvertVectorFloatToInt 66( 67 FXMVECTOR VFloat, 68 uint32_t MulExponent 69) 70{ 71 assert(MulExponent<32); 72#if defined(_XM_NO_INTRINSICS_) 73 // Get the scalar factor. 74 float fScale = (float)(1U << MulExponent); 75 uint32_t ElementIndex = 0; 76 XMVECTOR Result; 77 do { 78 int32_t iResult; 79 float fTemp = VFloat.vector4_f32[ElementIndex]*fScale; 80 if (fTemp <= -(65536.0f*32768.0f)) { 81 iResult = (-0x7FFFFFFF)-1; 82 } else if (fTemp > (65536.0f*32768.0f)-128.0f) { 83 iResult = 0x7FFFFFFF; 84 } else { 85 iResult = (int32_t)fTemp; 86 } 87 Result.vector4_u32[ElementIndex] = (uint32_t)iResult; 88 } while (++ElementIndex<4); 89 return Result; 90#elif defined(_XM_ARM_NEON_INTRINSICS_) 91 __n128 vResult = vdupq_n_f32((float)(1U << MulExponent)); 92 vResult = vmulq_f32(vResult,VFloat); 93 // In case of positive overflow, detect it 94 __n128 vOverflow = vcgtq_f32(vResult,g_XMMaxInt); 95 // Float to int conversion 96 __n128 vResulti = vcvtq_s32_f32(vResult); 97 // If there was positive overflow, set to 0x7FFFFFFF 98 vResult = vandq_u32(vOverflow,g_XMAbsMask); 99 vOverflow = vbicq_u32(vResulti,vOverflow); 100 vOverflow = vorrq_u32(vOverflow,vResult); 101 return vOverflow; 102#else // _XM_SSE_INTRINSICS_ 103 XMVECTOR vResult = _mm_set_ps1((float)(1U << MulExponent)); 104 vResult = _mm_mul_ps(vResult,VFloat); 105 // In case of positive overflow, detect it 106 XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt); 107 // Float to int conversion 108 __m128i vResulti = _mm_cvttps_epi32(vResult); 109 // If there was positive overflow, set to 0x7FFFFFFF 110 vResult = _mm_and_ps(vOverflow,g_XMAbsMask); 111 vOverflow = 
_mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); 112 vOverflow = _mm_or_ps(vOverflow,vResult); 113 return vOverflow; 114#endif 115} 116 117//------------------------------------------------------------------------------ 118 119inline XMVECTOR XMConvertVectorUIntToFloat 120( 121 FXMVECTOR VUInt, 122 uint32_t DivExponent 123) 124{ 125 assert(DivExponent<32); 126#if defined(_XM_NO_INTRINSICS_) 127 float fScale = 1.0f / (float)(1U << DivExponent); 128 uint32_t ElementIndex = 0; 129 XMVECTOR Result; 130 do { 131 Result.vector4_f32[ElementIndex] = (float)VUInt.vector4_u32[ElementIndex] * fScale; 132 } while (++ElementIndex<4); 133 return Result; 134#elif defined(_XM_ARM_NEON_INTRINSICS_) 135 __n128 vResult = vcvtq_f32_u32( VUInt ); 136 uint32_t uScale = 0x3F800000U - (DivExponent << 23); 137 __n128 vScale = vdupq_n_u32( uScale ); 138 return vmulq_f32( vResult, vScale ); 139#else // _XM_SSE_INTRINSICS_ 140 // For the values that are higher than 0x7FFFFFFF, a fixup is needed 141 // Determine which ones need the fix. 
142 XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero); 143 // Force all values positive 144 XMVECTOR vResult = _mm_xor_ps(VUInt,vMask); 145 // Convert to floats 146 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); 147 // Convert 0x80000000 -> 0xFFFFFFFF 148 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); 149 // For only the ones that are too big, add the fixup 150 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); 151 vResult = _mm_add_ps(vResult,vMask); 152 // Convert DivExponent into 1.0f/(1<<DivExponent) 153 uint32_t uScale = 0x3F800000U - (DivExponent << 23); 154 // Splat 155 iMask = _mm_set1_epi32(uScale); 156 vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(iMask)); 157 return vResult; 158#endif 159} 160 161//------------------------------------------------------------------------------ 162 163inline XMVECTOR XMConvertVectorFloatToUInt 164( 165 FXMVECTOR VFloat, 166 uint32_t MulExponent 167) 168{ 169 assert(MulExponent<32); 170#if defined(_XM_NO_INTRINSICS_) 171 // Get the scalar factor. 
172 float fScale = (float)(1U << MulExponent); 173 uint32_t ElementIndex = 0; 174 XMVECTOR Result; 175 do { 176 uint32_t uResult; 177 float fTemp = VFloat.vector4_f32[ElementIndex]*fScale; 178 if (fTemp <= 0.0f) { 179 uResult = 0; 180 } else if (fTemp >= (65536.0f*65536.0f)) { 181 uResult = 0xFFFFFFFFU; 182 } else { 183 uResult = (uint32_t)fTemp; 184 } 185 Result.vector4_u32[ElementIndex] = uResult; 186 } while (++ElementIndex<4); 187 return Result; 188#elif defined(_XM_ARM_NEON_INTRINSICS_) 189 __n128 vResult = vdupq_n_f32((float)(1U << MulExponent)); 190 vResult = vmulq_f32(vResult,VFloat); 191 // In case of overflow, detect it 192 __n128 vOverflow = vcgtq_f32(vResult,g_XMMaxUInt); 193 // Float to int conversion 194 __n128 vResulti = vcvtq_u32_f32(vResult); 195 // If there was overflow, set to 0xFFFFFFFFU 196 vResult = vbicq_u32(vResulti,vOverflow); 197 vOverflow = vorrq_u32(vOverflow,vResult); 198 return vOverflow; 199#else // _XM_SSE_INTRINSICS_ 200 XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent)); 201 vResult = _mm_mul_ps(vResult,VFloat); 202 // Clamp to >=0 203 vResult = _mm_max_ps(vResult,g_XMZero); 204 // Any numbers that are too big, set to 0xFFFFFFFFU 205 XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); 206 XMVECTOR vValue = g_XMUnsignedFix; 207 // Too large for a signed integer? 
208 XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); 209 // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise 210 vValue = _mm_and_ps(vValue,vMask); 211 // Perform fixup only on numbers too large (Keeps low bit precision) 212 vResult = _mm_sub_ps(vResult,vValue); 213 __m128i vResulti = _mm_cvttps_epi32(vResult); 214 // Convert from signed to unsigned pnly if greater than 0x80000000 215 vMask = _mm_and_ps(vMask,g_XMNegativeZero); 216 vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); 217 // On those that are too large, set to 0xFFFFFFFF 218 vResult = _mm_or_ps(vResult,vOverflow); 219 return vResult; 220#endif 221} 222 223#pragma warning(pop) 224 225#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_ || _XM_ARM_NEON_INTRINSICS_ 226 227/**************************************************************************** 228 * 229 * Vector and matrix load operations 230 * 231 ****************************************************************************/ 232 233//------------------------------------------------------------------------------ 234_Use_decl_annotations_ 235inline XMVECTOR XMLoadInt(const uint32_t* pSource) 236{ 237 assert(pSource); 238#if defined(_XM_NO_INTRINSICS_) 239 XMVECTOR V; 240 V.vector4_u32[0] = *pSource; 241 V.vector4_u32[1] = 0; 242 V.vector4_u32[2] = 0; 243 V.vector4_u32[3] = 0; 244 return V; 245#elif defined(_XM_ARM_NEON_INTRINSICS_) 246 __n128 zero = vdupq_n_u32(0); 247 return vld1q_lane_u32( pSource, zero, 0 ); 248#elif defined(_XM_SSE_INTRINSICS_) 249 return _mm_load_ss( reinterpret_cast<const float*>(pSource) ); 250#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 251#endif // _XM_VMX128_INTRINSICS_ 252} 253 254//------------------------------------------------------------------------------ 255_Use_decl_annotations_ 256inline XMVECTOR XMLoadFloat(const float* pSource) 257{ 258 assert(pSource); 259#if defined(_XM_NO_INTRINSICS_) 260 XMVECTOR V; 261 V.vector4_f32[0] = *pSource; 262 V.vector4_f32[1] = 0.f; 263 V.vector4_f32[2] = 0.f; 
264 V.vector4_f32[3] = 0.f; 265 return V; 266#elif defined(_XM_ARM_NEON_INTRINSICS_) 267 __n128 zero = vdupq_n_u32(0); 268 return vld1q_lane_f32( pSource, zero, 0 ); 269#elif defined(_XM_SSE_INTRINSICS_) 270 return _mm_load_ss( pSource ); 271#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 272#endif // _XM_VMX128_INTRINSICS_ 273} 274 275//------------------------------------------------------------------------------ 276_Use_decl_annotations_ 277inline XMVECTOR XMLoadInt2 278( 279 const uint32_t* pSource 280) 281{ 282 assert(pSource); 283#if defined(_XM_NO_INTRINSICS_) 284 XMVECTOR V; 285 V.vector4_u32[0] = pSource[0]; 286 V.vector4_u32[1] = pSource[1]; 287 V.vector4_u32[2] = 0; 288 V.vector4_u32[3] = 0; 289 return V; 290#elif defined(_XM_ARM_NEON_INTRINSICS_) 291 __n64 x = vld1_u32( pSource ); 292 __n64 zero = vdup_n_u32(0); 293 return vcombine_u32( x, zero ); 294#elif defined(_XM_SSE_INTRINSICS_) 295 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); 296 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) ); 297 return _mm_unpacklo_ps( x, y ); 298#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 299#endif // _XM_VMX128_INTRINSICS_ 300} 301 302//------------------------------------------------------------------------------ 303_Use_decl_annotations_ 304inline XMVECTOR XMLoadInt2A 305( 306 const uint32_t* pSource 307) 308{ 309 assert(pSource); 310 assert(((uintptr_t)pSource & 0xF) == 0); 311#if defined(_XM_NO_INTRINSICS_) 312 XMVECTOR V; 313 V.vector4_u32[0] = pSource[0]; 314 V.vector4_u32[1] = pSource[1]; 315 V.vector4_u32[2] = 0; 316 V.vector4_u32[3] = 0; 317 return V; 318#elif defined(_XM_ARM_NEON_INTRINSICS_) 319 __n64 x = vld1_u32_ex( pSource, 64 ); 320 __n64 zero = vdup_n_u32(0); 321 return vcombine_u32( x, zero ); 322#elif defined(_XM_SSE_INTRINSICS_) 323 __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) ); 324 return reinterpret_cast<__m128 *>(&V)[0]; 325#else // _XM_VMX128_INTRINSICS_ 326#endif // 
_XM_VMX128_INTRINSICS_ 327} 328 329//------------------------------------------------------------------------------ 330_Use_decl_annotations_ 331inline XMVECTOR XMLoadFloat2 332( 333 const XMFLOAT2* pSource 334) 335{ 336 assert(pSource); 337#if defined(_XM_NO_INTRINSICS_) 338 XMVECTOR V; 339 V.vector4_f32[0] = pSource->x; 340 V.vector4_f32[1] = pSource->y; 341 V.vector4_f32[2] = 0.f; 342 V.vector4_f32[3] = 0.f; 343 return V; 344#elif defined(_XM_ARM_NEON_INTRINSICS_) 345 __n64 x = vld1_f32( reinterpret_cast<const float*>(pSource) ); 346 __n64 zero = vdup_n_u32(0); 347 return vcombine_f32( x, zero ); 348#elif defined(_XM_SSE_INTRINSICS_) 349 __m128 x = _mm_load_ss( &pSource->x ); 350 __m128 y = _mm_load_ss( &pSource->y ); 351 return _mm_unpacklo_ps( x, y ); 352#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 353#endif // _XM_VMX128_INTRINSICS_ 354} 355 356//------------------------------------------------------------------------------ 357_Use_decl_annotations_ 358inline XMVECTOR XMLoadFloat2A 359( 360 const XMFLOAT2A* pSource 361) 362{ 363 assert(pSource); 364 assert(((uintptr_t)pSource & 0xF) == 0); 365#if defined(_XM_NO_INTRINSICS_) 366 XMVECTOR V; 367 V.vector4_f32[0] = pSource->x; 368 V.vector4_f32[1] = pSource->y; 369 V.vector4_f32[2] = 0.f; 370 V.vector4_f32[3] = 0.f; 371 return V; 372#elif defined(_XM_ARM_NEON_INTRINSICS_) 373 __n64 x = vld1_f32_ex( reinterpret_cast<const float*>(pSource), 64 ); 374 __n64 zero = vdup_n_u32(0); 375 return vcombine_f32( x, zero ); 376#elif defined(_XM_SSE_INTRINSICS_) 377 __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) ); 378 return reinterpret_cast<__m128 *>(&V)[0]; 379#else // _XM_VMX128_INTRINSICS_ 380#endif // _XM_VMX128_INTRINSICS_ 381} 382 383//------------------------------------------------------------------------------ 384_Use_decl_annotations_ 385inline XMVECTOR XMLoadSInt2 386( 387 const XMINT2* pSource 388) 389{ 390 assert(pSource); 391#if defined(_XM_NO_INTRINSICS_) 392 XMVECTOR V; 393 
V.vector4_f32[0] = (float)pSource->x; 394 V.vector4_f32[1] = (float)pSource->y; 395 V.vector4_f32[2] = 0.f; 396 V.vector4_f32[3] = 0.f; 397 return V; 398#elif defined(_XM_ARM_NEON_INTRINSICS_) 399 __n64 x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) ); 400 __n64 v = vcvt_f32_s32( x ); 401 __n64 zero = vdup_n_u32(0); 402 return vcombine_s32( v, zero ); 403#elif defined(_XM_SSE_INTRINSICS_) 404 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); 405 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); 406 __m128 V = _mm_unpacklo_ps( x, y ); 407 return _mm_cvtepi32_ps(_mm_castps_si128(V)); 408#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 409#endif // _XM_VMX128_INTRINSICS_ 410} 411 412//------------------------------------------------------------------------------ 413_Use_decl_annotations_ 414inline XMVECTOR XMLoadUInt2 415( 416 const XMUINT2* pSource 417) 418{ 419 assert(pSource); 420#if defined(_XM_NO_INTRINSICS_) 421 XMVECTOR V; 422 V.vector4_f32[0] = (float)pSource->x; 423 V.vector4_f32[1] = (float)pSource->y; 424 V.vector4_f32[2] = 0.f; 425 V.vector4_f32[3] = 0.f; 426 return V; 427#elif defined(_XM_ARM_NEON_INTRINSICS_) 428 __n64 x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) ); 429 __n64 v = vcvt_f32_u32( x ); 430 __n64 zero = vdup_n_u32(0); 431 return vcombine_u32( v, zero ); 432#elif defined(_XM_SSE_INTRINSICS_) 433 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); 434 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); 435 __m128 V = _mm_unpacklo_ps( x, y ); 436 // For the values that are higher than 0x7FFFFFFF, a fixup is needed 437 // Determine which ones need the fix. 
438 XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); 439 // Force all values positive 440 XMVECTOR vResult = _mm_xor_ps(V,vMask); 441 // Convert to floats 442 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); 443 // Convert 0x80000000 -> 0xFFFFFFFF 444 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); 445 // For only the ones that are too big, add the fixup 446 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); 447 vResult = _mm_add_ps(vResult,vMask); 448 return vResult; 449#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 450#endif // _XM_VMX128_INTRINSICS_ 451} 452 453//------------------------------------------------------------------------------ 454_Use_decl_annotations_ 455inline XMVECTOR XMLoadInt3 456( 457 const uint32_t* pSource 458) 459{ 460 assert(pSource); 461#if defined(_XM_NO_INTRINSICS_) 462 XMVECTOR V; 463 V.vector4_u32[0] = pSource[0]; 464 V.vector4_u32[1] = pSource[1]; 465 V.vector4_u32[2] = pSource[2]; 466 V.vector4_u32[3] = 0; 467 return V; 468#elif defined(_XM_ARM_NEON_INTRINSICS_) 469 __n64 x = vld1_u32( pSource ); 470 __n64 zero = vdup_n_u32(0); 471 __n64 y = vld1_lane_u32( pSource+2, zero, 0 ); 472 return vcombine_u32( x, y ); 473#elif defined(_XM_SSE_INTRINSICS_) 474 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) ); 475 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) ); 476 __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) ); 477 __m128 xy = _mm_unpacklo_ps( x, y ); 478 return _mm_movelh_ps( xy, z ); 479#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 480#endif // _XM_VMX128_INTRINSICS_ 481} 482 483//------------------------------------------------------------------------------ 484_Use_decl_annotations_ 485inline XMVECTOR XMLoadInt3A 486( 487 const uint32_t* pSource 488) 489{ 490 assert(pSource); 491 assert(((uintptr_t)pSource & 0xF) == 0); 492#if defined(_XM_NO_INTRINSICS_) 493 XMVECTOR V; 494 V.vector4_u32[0] = pSource[0]; 495 V.vector4_u32[1] = pSource[1]; 496 
V.vector4_u32[2] = pSource[2]; 497 V.vector4_u32[3] = 0; 498 return V; 499#elif defined(_XM_ARM_NEON_INTRINSICS_) 500 // Reads an extra integer which is zero'd 501 __n128 V = vld1q_u32_ex( pSource, 128 ); 502 return vsetq_lane_u32( 0, V, 3 ); 503#elif defined(_XM_SSE_INTRINSICS_) 504 // Reads an extra integer which is zero'd 505 __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) ); 506 V = _mm_and_si128( V, g_XMMask3 ); 507 return reinterpret_cast<__m128 *>(&V)[0]; 508#else // _XM_VMX128_INTRINSICS_ 509#endif // _XM_VMX128_INTRINSICS_ 510} 511 512//------------------------------------------------------------------------------ 513_Use_decl_annotations_ 514inline XMVECTOR XMLoadFloat3 515( 516 const XMFLOAT3* pSource 517) 518{ 519 assert(pSource); 520#if defined(_XM_NO_INTRINSICS_) 521 XMVECTOR V; 522 V.vector4_f32[0] = pSource->x; 523 V.vector4_f32[1] = pSource->y; 524 V.vector4_f32[2] = pSource->z; 525 V.vector4_f32[3] = 0.f; 526 return V; 527#elif defined(_XM_ARM_NEON_INTRINSICS_) 528 __n64 x = vld1_f32( reinterpret_cast<const float*>(pSource) ); 529 __n64 zero = vdup_n_u32(0); 530 __n64 y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 ); 531 return vcombine_f32( x, y ); 532#elif defined(_XM_SSE_INTRINSICS_) 533 __m128 x = _mm_load_ss( &pSource->x ); 534 __m128 y = _mm_load_ss( &pSource->y ); 535 __m128 z = _mm_load_ss( &pSource->z ); 536 __m128 xy = _mm_unpacklo_ps( x, y ); 537 return _mm_movelh_ps( xy, z ); 538#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 539#endif // _XM_VMX128_INTRINSICS_ 540} 541 542//------------------------------------------------------------------------------ 543_Use_decl_annotations_ 544inline XMVECTOR XMLoadFloat3A 545( 546 const XMFLOAT3A* pSource 547) 548{ 549 assert(pSource); 550 assert(((uintptr_t)pSource & 0xF) == 0); 551#if defined(_XM_NO_INTRINSICS_) 552 XMVECTOR V; 553 V.vector4_f32[0] = pSource->x; 554 V.vector4_f32[1] = pSource->y; 555 V.vector4_f32[2] = pSource->z; 556 V.vector4_f32[3] = 
0.f; 557 return V; 558#elif defined(_XM_ARM_NEON_INTRINSICS_) 559 // Reads an extra float which is zero'd 560 __n128 V = vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 ); 561 return vsetq_lane_f32( 0, V, 3 ); 562#elif defined(_XM_SSE_INTRINSICS_) 563 // Reads an extra float which is zero'd 564 __m128 V = _mm_load_ps( &pSource->x ); 565 return _mm_and_ps( V, g_XMMask3 ); 566#else // _XM_VMX128_INTRINSICS_ 567#endif // _XM_VMX128_INTRINSICS_ 568} 569 570//------------------------------------------------------------------------------ 571_Use_decl_annotations_ 572inline XMVECTOR XMLoadSInt3 573( 574 const XMINT3* pSource 575) 576{ 577 assert(pSource); 578#if defined(_XM_NO_INTRINSICS_) 579 580 XMVECTOR V; 581 V.vector4_f32[0] = (float)pSource->x; 582 V.vector4_f32[1] = (float)pSource->y; 583 V.vector4_f32[2] = (float)pSource->z; 584 V.vector4_f32[3] = 0.f; 585 return V; 586 587#elif defined(_XM_ARM_NEON_INTRINSICS_) 588 __n64 x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) ); 589 __n64 zero = vdup_n_u32(0); 590 __n64 y = vld1_lane_s32( reinterpret_cast<const int32_t*>(pSource)+2, zero, 0 ); 591 __n128 v = vcombine_s32( x, y ); 592 return vcvtq_f32_s32( v ); 593#elif defined(_XM_SSE_INTRINSICS_) 594 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); 595 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); 596 __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) ); 597 __m128 xy = _mm_unpacklo_ps( x, y ); 598 __m128 V = _mm_movelh_ps( xy, z ); 599 return _mm_cvtepi32_ps(_mm_castps_si128(V)); 600#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 601#endif // _XM_VMX128_INTRINSICS_ 602} 603 604//------------------------------------------------------------------------------ 605_Use_decl_annotations_ 606inline XMVECTOR XMLoadUInt3 607( 608 const XMUINT3* pSource 609) 610{ 611 assert(pSource); 612#if defined(_XM_NO_INTRINSICS_) 613 XMVECTOR V; 614 V.vector4_f32[0] = (float)pSource->x; 615 V.vector4_f32[1] = 
(float)pSource->y; 616 V.vector4_f32[2] = (float)pSource->z; 617 V.vector4_f32[3] = 0.f; 618 return V; 619#elif defined(_XM_ARM_NEON_INTRINSICS_) 620 __n64 x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) ); 621 __n64 zero = vdup_n_u32(0); 622 __n64 y = vld1_lane_u32( reinterpret_cast<const uint32_t*>(pSource)+2, zero, 0 ); 623 __n128 v = vcombine_u32( x, y ); 624 return vcvtq_f32_u32( v ); 625#elif defined(_XM_SSE_INTRINSICS_) 626 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) ); 627 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) ); 628 __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) ); 629 __m128 xy = _mm_unpacklo_ps( x, y ); 630 __m128 V = _mm_movelh_ps( xy, z ); 631 // For the values that are higher than 0x7FFFFFFF, a fixup is needed 632 // Determine which ones need the fix. 633 XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero); 634 // Force all values positive 635 XMVECTOR vResult = _mm_xor_ps(V,vMask); 636 // Convert to floats 637 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); 638 // Convert 0x80000000 -> 0xFFFFFFFF 639 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); 640 // For only the ones that are too big, add the fixup 641 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); 642 vResult = _mm_add_ps(vResult,vMask); 643 return vResult; 644 645#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 646#endif // _XM_VMX128_INTRINSICS_ 647} 648 649//------------------------------------------------------------------------------ 650_Use_decl_annotations_ 651inline XMVECTOR XMLoadInt4 652( 653 const uint32_t* pSource 654) 655{ 656 assert(pSource); 657 658#if defined(_XM_NO_INTRINSICS_) 659 XMVECTOR V; 660 V.vector4_u32[0] = pSource[0]; 661 V.vector4_u32[1] = pSource[1]; 662 V.vector4_u32[2] = pSource[2]; 663 V.vector4_u32[3] = pSource[3]; 664 return V; 665#elif defined(_XM_ARM_NEON_INTRINSICS_) 666 return vld1q_u32( pSource ); 667#elif defined(_XM_SSE_INTRINSICS_) 668 __m128i V 
= _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); 669 return reinterpret_cast<__m128 *>(&V)[0]; 670#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 671#endif // _XM_VMX128_INTRINSICS_ 672} 673 674//------------------------------------------------------------------------------ 675_Use_decl_annotations_ 676inline XMVECTOR XMLoadInt4A 677( 678 const uint32_t* pSource 679) 680{ 681 assert(pSource); 682 assert(((uintptr_t)pSource & 0xF) == 0); 683#if defined(_XM_NO_INTRINSICS_) 684 XMVECTOR V; 685 V.vector4_u32[0] = pSource[0]; 686 V.vector4_u32[1] = pSource[1]; 687 V.vector4_u32[2] = pSource[2]; 688 V.vector4_u32[3] = pSource[3]; 689 return V; 690#elif defined(_XM_ARM_NEON_INTRINSICS_) 691 return vld1q_u32_ex( pSource, 128 ); 692#elif defined(_XM_SSE_INTRINSICS_) 693 __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) ); 694 return reinterpret_cast<__m128 *>(&V)[0]; 695#else // _XM_VMX128_INTRINSICS_ 696#endif // _XM_VMX128_INTRINSICS_ 697} 698 699//------------------------------------------------------------------------------ 700_Use_decl_annotations_ 701inline XMVECTOR XMLoadFloat4 702( 703 const XMFLOAT4* pSource 704) 705{ 706 assert(pSource); 707#if defined(_XM_NO_INTRINSICS_) 708 XMVECTOR V; 709 V.vector4_f32[0] = pSource->x; 710 V.vector4_f32[1] = pSource->y; 711 V.vector4_f32[2] = pSource->z; 712 V.vector4_f32[3] = pSource->w; 713 return V; 714#elif defined(_XM_ARM_NEON_INTRINSICS_) 715 return vld1q_f32( reinterpret_cast<const float*>(pSource) ); 716#elif defined(_XM_SSE_INTRINSICS_) 717 return _mm_loadu_ps( &pSource->x ); 718#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 719#endif // _XM_VMX128_INTRINSICS_ 720} 721 722//------------------------------------------------------------------------------ 723_Use_decl_annotations_ 724inline XMVECTOR XMLoadFloat4A 725( 726 const XMFLOAT4A* pSource 727) 728{ 729 assert(pSource); 730 assert(((uintptr_t)pSource & 0xF) == 0); 731#if defined(_XM_NO_INTRINSICS_) 732 XMVECTOR V; 733 V.vector4_f32[0] 
= pSource->x; 734 V.vector4_f32[1] = pSource->y; 735 V.vector4_f32[2] = pSource->z; 736 V.vector4_f32[3] = pSource->w; 737 return V; 738#elif defined(_XM_ARM_NEON_INTRINSICS_) 739 return vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 ); 740#elif defined(_XM_SSE_INTRINSICS_) 741 return _mm_load_ps( &pSource->x ); 742#else // _XM_VMX128_INTRINSICS_ 743#endif // _XM_VMX128_INTRINSICS_ 744} 745 746//------------------------------------------------------------------------------ 747_Use_decl_annotations_ 748inline XMVECTOR XMLoadSInt4 749( 750 const XMINT4* pSource 751) 752{ 753 assert(pSource); 754#if defined(_XM_NO_INTRINSICS_) 755 756 XMVECTOR V; 757 V.vector4_f32[0] = (float)pSource->x; 758 V.vector4_f32[1] = (float)pSource->y; 759 V.vector4_f32[2] = (float)pSource->z; 760 V.vector4_f32[3] = (float)pSource->w; 761 return V; 762 763#elif defined(_XM_ARM_NEON_INTRINSICS_) 764 __n128 v = vld1q_s32( reinterpret_cast<const int32_t*>(pSource) ); 765 return vcvtq_f32_s32( v ); 766#elif defined(_XM_SSE_INTRINSICS_) 767 __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); 768 return _mm_cvtepi32_ps(V); 769#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 770#endif // _XM_VMX128_INTRINSICS_ 771} 772 773//------------------------------------------------------------------------------ 774_Use_decl_annotations_ 775inline XMVECTOR XMLoadUInt4 776( 777 const XMUINT4* pSource 778) 779{ 780 assert(pSource); 781#if defined(_XM_NO_INTRINSICS_) 782 XMVECTOR V; 783 V.vector4_f32[0] = (float)pSource->x; 784 V.vector4_f32[1] = (float)pSource->y; 785 V.vector4_f32[2] = (float)pSource->z; 786 V.vector4_f32[3] = (float)pSource->w; 787 return V; 788#elif defined(_XM_ARM_NEON_INTRINSICS_) 789 __n128 v = vld1q_u32( reinterpret_cast<const uint32_t*>(pSource) ); 790 return vcvtq_f32_u32( v ); 791#elif defined(_XM_SSE_INTRINSICS_) 792 __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) ); 793 // For the values that are higher than 0x7FFFFFFF, a fixup 
is needed 794 // Determine which ones need the fix. 795 XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero); 796 // Force all values positive 797 XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask); 798 // Convert to floats 799 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); 800 // Convert 0x80000000 -> 0xFFFFFFFF 801 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31); 802 // For only the ones that are too big, add the fixup 803 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned); 804 vResult = _mm_add_ps(vResult,vMask); 805 return vResult; 806#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 807#endif // _XM_VMX128_INTRINSICS_ 808} 809 810//------------------------------------------------------------------------------ 811_Use_decl_annotations_ 812inline XMMATRIX XMLoadFloat3x3 813( 814 const XMFLOAT3X3* pSource 815) 816{ 817 assert(pSource); 818#if defined(_XM_NO_INTRINSICS_) 819 820 XMMATRIX M; 821 M.r[0].vector4_f32[0] = pSource->m[0][0]; 822 M.r[0].vector4_f32[1] = pSource->m[0][1]; 823 M.r[0].vector4_f32[2] = pSource->m[0][2]; 824 M.r[0].vector4_f32[3] = 0.0f; 825 826 M.r[1].vector4_f32[0] = pSource->m[1][0]; 827 M.r[1].vector4_f32[1] = pSource->m[1][1]; 828 M.r[1].vector4_f32[2] = pSource->m[1][2]; 829 M.r[1].vector4_f32[3] = 0.0f; 830 831 M.r[2].vector4_f32[0] = pSource->m[2][0]; 832 M.r[2].vector4_f32[1] = pSource->m[2][1]; 833 M.r[2].vector4_f32[2] = pSource->m[2][2]; 834 M.r[2].vector4_f32[3] = 0.0f; 835 M.r[3].vector4_f32[0] = 0.0f; 836 M.r[3].vector4_f32[1] = 0.0f; 837 M.r[3].vector4_f32[2] = 0.0f; 838 M.r[3].vector4_f32[3] = 1.0f; 839 return M; 840 841#elif defined(_XM_ARM_NEON_INTRINSICS_) 842 __n128 v0 = vld1q_f32( &pSource->m[0][0] ); 843 __n128 v1 = vld1q_f32( &pSource->m[1][1] ); 844 __n64 v2 = vcreate_f32( (uint64_t)*(const uint32_t*)&pSource->m[2][2] ); 845 __n128 T = vextq_f32( v0, v1, 3 ); 846 847 XMMATRIX M; 848 M.r[0] = vandq_u32( v0, g_XMMask3 ); 849 M.r[1] = vandq_u32( T, g_XMMask3 ); 850 M.r[2] = 
vcombine_f32( vget_high_f32(v1), v2 ); 851 M.r[3] = g_XMIdentityR3; 852 return M; 853#elif defined(_XM_SSE_INTRINSICS_) 854 __m128 Z = _mm_setzero_ps(); 855 856 __m128 V1 = _mm_loadu_ps( &pSource->m[0][0] ); 857 __m128 V2 = _mm_loadu_ps( &pSource->m[1][1] ); 858 __m128 V3 = _mm_load_ss( &pSource->m[2][2] ); 859 860 __m128 T1 = _mm_unpackhi_ps( V1, Z ); 861 __m128 T2 = _mm_unpacklo_ps( V2, Z ); 862 __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) ); 863 __m128 T4 = _mm_movehl_ps( T2, T3 ); 864 __m128 T5 = _mm_movehl_ps( Z, T1 ); 865 866 XMMATRIX M; 867 M.r[0] = _mm_movelh_ps( V1, T1 ); 868 M.r[1] = _mm_add_ps( T4, T5 ); 869 M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) ); 870 M.r[3] = g_XMIdentityR3; 871 return M; 872#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 873#endif // _XM_VMX128_INTRINSICS_ 874} 875 876//------------------------------------------------------------------------------ 877_Use_decl_annotations_ 878inline XMMATRIX XMLoadFloat4x3 879( 880 const XMFLOAT4X3* pSource 881) 882{ 883 assert(pSource); 884#if defined(_XM_NO_INTRINSICS_) 885 886 XMMATRIX M; 887 M.r[0].vector4_f32[0] = pSource->m[0][0]; 888 M.r[0].vector4_f32[1] = pSource->m[0][1]; 889 M.r[0].vector4_f32[2] = pSource->m[0][2]; 890 M.r[0].vector4_f32[3] = 0.0f; 891 892 M.r[1].vector4_f32[0] = pSource->m[1][0]; 893 M.r[1].vector4_f32[1] = pSource->m[1][1]; 894 M.r[1].vector4_f32[2] = pSource->m[1][2]; 895 M.r[1].vector4_f32[3] = 0.0f; 896 897 M.r[2].vector4_f32[0] = pSource->m[2][0]; 898 M.r[2].vector4_f32[1] = pSource->m[2][1]; 899 M.r[2].vector4_f32[2] = pSource->m[2][2]; 900 M.r[2].vector4_f32[3] = 0.0f; 901 902 M.r[3].vector4_f32[0] = pSource->m[3][0]; 903 M.r[3].vector4_f32[1] = pSource->m[3][1]; 904 M.r[3].vector4_f32[2] = pSource->m[3][2]; 905 M.r[3].vector4_f32[3] = 1.0f; 906 return M; 907 908#elif defined(_XM_ARM_NEON_INTRINSICS_) 909 __n128 v0 = vld1q_f32( &pSource->m[0][0] ); 910 __n128 v1 = vld1q_f32( &pSource->m[1][1] ); 911 __n128 v2 = vld1q_f32( 
&pSource->m[2][2] ); 912 913 __n128 T1 = vextq_f32( v0, v1, 3 ); 914 __n128 T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); 915 __n128 T3 = vextq_f32( v2, v2, 1 ); 916 917 XMMATRIX M; 918 M.r[0] = vandq_u32( v0, g_XMMask3 ); 919 M.r[1] = vandq_u32( T1, g_XMMask3 ); 920 M.r[2] = vandq_u32( T2, g_XMMask3 ); 921 M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); 922 return M; 923#elif defined(_XM_SSE_INTRINSICS_) 924 // Use unaligned load instructions to 925 // load the 12 floats 926 // vTemp1 = x1,y1,z1,x2 927 XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]); 928 // vTemp2 = y2,z2,x3,y3 929 XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]); 930 // vTemp4 = z3,x4,y4,z4 931 XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]); 932 // vTemp3 = x3,y3,z3,z3 933 XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); 934 // vTemp2 = y2,z2,x2,x2 935 vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); 936 // vTemp2 = x2,y2,z2,z2 937 vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); 938 // vTemp1 = x1,y1,z1,0 939 vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); 940 // vTemp2 = x2,y2,z2,0 941 vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); 942 // vTemp3 = x3,y3,z3,0 943 vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); 944 // vTemp4i = x4,y4,z4,0 945 __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); 946 // vTemp4i = x4,y4,z4,1.0f 947 vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); 948 XMMATRIX M(vTemp1, 949 vTemp2, 950 vTemp3, 951 _mm_castsi128_ps(vTemp4i)); 952 return M; 953#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 954#endif // _XM_VMX128_INTRINSICS_ 955} 956 957//------------------------------------------------------------------------------ 958_Use_decl_annotations_ 959inline XMMATRIX XMLoadFloat4x3A 960( 961 const XMFLOAT4X3A* pSource 962) 963{ 964 assert(pSource); 965 assert(((uintptr_t)pSource & 0xF) == 0); 966#if defined(_XM_NO_INTRINSICS_) 967 968 XMMATRIX M; 969 M.r[0].vector4_f32[0] = pSource->m[0][0]; 970 M.r[0].vector4_f32[1] = pSource->m[0][1]; 971 
M.r[0].vector4_f32[2] = pSource->m[0][2]; 972 M.r[0].vector4_f32[3] = 0.0f; 973 974 M.r[1].vector4_f32[0] = pSource->m[1][0]; 975 M.r[1].vector4_f32[1] = pSource->m[1][1]; 976 M.r[1].vector4_f32[2] = pSource->m[1][2]; 977 M.r[1].vector4_f32[3] = 0.0f; 978 979 M.r[2].vector4_f32[0] = pSource->m[2][0]; 980 M.r[2].vector4_f32[1] = pSource->m[2][1]; 981 M.r[2].vector4_f32[2] = pSource->m[2][2]; 982 M.r[2].vector4_f32[3] = 0.0f; 983 984 M.r[3].vector4_f32[0] = pSource->m[3][0]; 985 M.r[3].vector4_f32[1] = pSource->m[3][1]; 986 M.r[3].vector4_f32[2] = pSource->m[3][2]; 987 M.r[3].vector4_f32[3] = 1.0f; 988 return M; 989 990#elif defined(_XM_ARM_NEON_INTRINSICS_) 991 __n128 v0 = vld1q_f32_ex( &pSource->m[0][0], 128 ); 992 __n128 v1 = vld1q_f32_ex( &pSource->m[1][1], 128 ); 993 __n128 v2 = vld1q_f32_ex( &pSource->m[2][2], 128 ); 994 995 __n128 T1 = vextq_f32( v0, v1, 3 ); 996 __n128 T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) ); 997 __n128 T3 = vextq_f32( v2, v2, 1 ); 998 999 XMMATRIX M; 1000 M.r[0] = vandq_u32( v0, g_XMMask3 ); 1001 M.r[1] = vandq_u32( T1, g_XMMask3 ); 1002 M.r[2] = vandq_u32( T2, g_XMMask3 ); 1003 M.r[3] = vsetq_lane_f32( 1.f, T3, 3 ); 1004 return M; 1005#elif defined(_XM_SSE_INTRINSICS_) 1006 // Use aligned load instructions to 1007 // load the 12 floats 1008 // vTemp1 = x1,y1,z1,x2 1009 XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]); 1010 // vTemp2 = y2,z2,x3,y3 1011 XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]); 1012 // vTemp4 = z3,x4,y4,z4 1013 XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]); 1014 // vTemp3 = x3,y3,z3,z3 1015 XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2)); 1016 // vTemp2 = y2,z2,x2,x2 1017 vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0)); 1018 // vTemp2 = x2,y2,z2,z2 1019 vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2)); 1020 // vTemp1 = x1,y1,z1,0 1021 vTemp1 = _mm_and_ps(vTemp1,g_XMMask3); 1022 // vTemp2 = x2,y2,z2,0 1023 vTemp2 = _mm_and_ps(vTemp2,g_XMMask3); 1024 // vTemp3 = 
x3,y3,z3,0 1025 vTemp3 = _mm_and_ps(vTemp3,g_XMMask3); 1026 // vTemp4i = x4,y4,z4,0 1027 __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8); 1028 // vTemp4i = x4,y4,z4,1.0f 1029 vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3); 1030 XMMATRIX M(vTemp1, 1031 vTemp2, 1032 vTemp3, 1033 _mm_castsi128_ps(vTemp4i)); 1034 return M; 1035#else // _XM_VMX128_INTRINSICS_ 1036#endif // _XM_VMX128_INTRINSICS_ 1037} 1038 1039//------------------------------------------------------------------------------ 1040_Use_decl_annotations_ 1041inline XMMATRIX XMLoadFloat4x4 1042( 1043 const XMFLOAT4X4* pSource 1044) 1045{ 1046 assert(pSource); 1047#if defined(_XM_NO_INTRINSICS_) 1048 1049 XMMATRIX M; 1050 M.r[0].vector4_f32[0] = pSource->m[0][0]; 1051 M.r[0].vector4_f32[1] = pSource->m[0][1]; 1052 M.r[0].vector4_f32[2] = pSource->m[0][2]; 1053 M.r[0].vector4_f32[3] = pSource->m[0][3]; 1054 1055 M.r[1].vector4_f32[0] = pSource->m[1][0]; 1056 M.r[1].vector4_f32[1] = pSource->m[1][1]; 1057 M.r[1].vector4_f32[2] = pSource->m[1][2]; 1058 M.r[1].vector4_f32[3] = pSource->m[1][3]; 1059 1060 M.r[2].vector4_f32[0] = pSource->m[2][0]; 1061 M.r[2].vector4_f32[1] = pSource->m[2][1]; 1062 M.r[2].vector4_f32[2] = pSource->m[2][2]; 1063 M.r[2].vector4_f32[3] = pSource->m[2][3]; 1064 1065 M.r[3].vector4_f32[0] = pSource->m[3][0]; 1066 M.r[3].vector4_f32[1] = pSource->m[3][1]; 1067 M.r[3].vector4_f32[2] = pSource->m[3][2]; 1068 M.r[3].vector4_f32[3] = pSource->m[3][3]; 1069 return M; 1070 1071#elif defined(_XM_ARM_NEON_INTRINSICS_) 1072 XMMATRIX M; 1073 M.r[0] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_11) ); 1074 M.r[1] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_21) ); 1075 M.r[2] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_31) ); 1076 M.r[3] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_41) ); 1077 return M; 1078#elif defined(_XM_SSE_INTRINSICS_) 1079 XMMATRIX M; 1080 M.r[0] = _mm_loadu_ps( &pSource->_11 ); 1081 M.r[1] = _mm_loadu_ps( &pSource->_21 
); 1082 M.r[2] = _mm_loadu_ps( &pSource->_31 ); 1083 M.r[3] = _mm_loadu_ps( &pSource->_41 ); 1084 return M; 1085#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 1086#endif // _XM_VMX128_INTRINSICS_ 1087} 1088 1089//------------------------------------------------------------------------------ 1090_Use_decl_annotations_ 1091inline XMMATRIX XMLoadFloat4x4A 1092( 1093 const XMFLOAT4X4A* pSource 1094) 1095{ 1096 assert(pSource); 1097 assert(((uintptr_t)pSource & 0xF) == 0); 1098#if defined(_XM_NO_INTRINSICS_) 1099 1100 XMMATRIX M; 1101 M.r[0].vector4_f32[0] = pSource->m[0][0]; 1102 M.r[0].vector4_f32[1] = pSource->m[0][1]; 1103 M.r[0].vector4_f32[2] = pSource->m[0][2]; 1104 M.r[0].vector4_f32[3] = pSource->m[0][3]; 1105 1106 M.r[1].vector4_f32[0] = pSource->m[1][0]; 1107 M.r[1].vector4_f32[1] = pSource->m[1][1]; 1108 M.r[1].vector4_f32[2] = pSource->m[1][2]; 1109 M.r[1].vector4_f32[3] = pSource->m[1][3]; 1110 1111 M.r[2].vector4_f32[0] = pSource->m[2][0]; 1112 M.r[2].vector4_f32[1] = pSource->m[2][1]; 1113 M.r[2].vector4_f32[2] = pSource->m[2][2]; 1114 M.r[2].vector4_f32[3] = pSource->m[2][3]; 1115 1116 M.r[3].vector4_f32[0] = pSource->m[3][0]; 1117 M.r[3].vector4_f32[1] = pSource->m[3][1]; 1118 M.r[3].vector4_f32[2] = pSource->m[3][2]; 1119 M.r[3].vector4_f32[3] = pSource->m[3][3]; 1120 return M; 1121 1122#elif defined(_XM_ARM_NEON_INTRINSICS_) 1123 XMMATRIX M; 1124 M.r[0] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_11), 128 ); 1125 M.r[1] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_21), 128 ); 1126 M.r[2] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_31), 128 ); 1127 M.r[3] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_41), 128 ); 1128 return M; 1129#elif defined(_XM_SSE_INTRINSICS_) 1130 XMMATRIX M; 1131 M.r[0] = _mm_load_ps( &pSource->_11 ); 1132 M.r[1] = _mm_load_ps( &pSource->_21 ); 1133 M.r[2] = _mm_load_ps( &pSource->_31 ); 1134 M.r[3] = _mm_load_ps( &pSource->_41 ); 1135 return M; 1136#else // 
_XM_VMX128_INTRINSICS_ 1137#endif // _XM_VMX128_INTRINSICS_ 1138} 1139 1140/**************************************************************************** 1141 * 1142 * Vector and matrix store operations 1143 * 1144 ****************************************************************************/ 1145_Use_decl_annotations_ 1146inline void XMStoreInt 1147( 1148 uint32_t* pDestination, 1149 FXMVECTOR V 1150) 1151{ 1152 assert(pDestination); 1153#if defined(_XM_NO_INTRINSICS_) 1154 *pDestination = XMVectorGetIntX( V ); 1155#elif defined(_XM_ARM_NEON_INTRINSICS_) 1156 vst1q_lane_u32( pDestination, V, 0 ); 1157#elif defined(_XM_SSE_INTRINSICS_) 1158 _mm_store_ss( reinterpret_cast<float*>(pDestination), V ); 1159#else // _XM_VMX128_INTRINSICS_ 1160#endif // _XM_VMX128_INTRINSICS_ 1161} 1162 1163//------------------------------------------------------------------------------ 1164_Use_decl_annotations_ 1165inline void XMStoreFloat 1166( 1167 float* pDestination, 1168 FXMVECTOR V 1169) 1170{ 1171 assert(pDestination); 1172#if defined(_XM_NO_INTRINSICS_) 1173 *pDestination = XMVectorGetX( V ); 1174#elif defined(_XM_ARM_NEON_INTRINSICS_) 1175 vst1q_lane_f32( pDestination, V, 0 ); 1176#elif defined(_XM_SSE_INTRINSICS_) 1177 _mm_store_ss( pDestination, V ); 1178#else // _XM_VMX128_INTRINSICS_ 1179#endif // _XM_VMX128_INTRINSICS_ 1180} 1181 1182//------------------------------------------------------------------------------ 1183_Use_decl_annotations_ 1184inline void XMStoreInt2 1185( 1186 uint32_t* pDestination, 1187 FXMVECTOR V 1188) 1189{ 1190 assert(pDestination); 1191#if defined(_XM_NO_INTRINSICS_) 1192 pDestination[0] = V.vector4_u32[0]; 1193 pDestination[1] = V.vector4_u32[1]; 1194#elif defined(_XM_ARM_NEON_INTRINSICS_) 1195 __n64 VL = vget_low_u32(V); 1196 vst1_u32( pDestination, VL ); 1197#elif defined(_XM_SSE_INTRINSICS_) 1198 XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); 1199 _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V ); 1200 _mm_store_ss( 
reinterpret_cast<float*>(&pDestination[1]), T ); 1201#else // _XM_VMX128_INTRINSICS_ 1202#endif // _XM_VMX128_INTRINSICS_ 1203} 1204 1205//------------------------------------------------------------------------------ 1206_Use_decl_annotations_ 1207inline void XMStoreInt2A 1208( 1209 uint32_t* pDestination, 1210 FXMVECTOR V 1211) 1212{ 1213 assert(pDestination); 1214 assert(((uintptr_t)pDestination & 0xF) == 0); 1215#if defined(_XM_NO_INTRINSICS_) 1216 pDestination[0] = V.vector4_u32[0]; 1217 pDestination[1] = V.vector4_u32[1]; 1218#elif defined(_XM_ARM_NEON_INTRINSICS_) 1219 __n64 VL = vget_low_u32(V); 1220 vst1_u32_ex( pDestination, VL, 64 ); 1221#elif defined(_XM_SSE_INTRINSICS_) 1222 _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); 1223#else // _XM_VMX128_INTRINSICS_ 1224#endif // _XM_VMX128_INTRINSICS_ 1225} 1226 1227//------------------------------------------------------------------------------ 1228_Use_decl_annotations_ 1229inline void XMStoreFloat2 1230( 1231 XMFLOAT2* pDestination, 1232 FXMVECTOR V 1233) 1234{ 1235 assert(pDestination); 1236#if defined(_XM_NO_INTRINSICS_) 1237 pDestination->x = V.vector4_f32[0]; 1238 pDestination->y = V.vector4_f32[1]; 1239#elif defined(_XM_ARM_NEON_INTRINSICS_) 1240 __n64 VL = vget_low_f32(V); 1241 vst1_f32( reinterpret_cast<float*>(pDestination), VL ); 1242#elif defined(_XM_SSE_INTRINSICS_) 1243 XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) ); 1244 _mm_store_ss( &pDestination->x, V ); 1245 _mm_store_ss( &pDestination->y, T ); 1246#else // _XM_VMX128_INTRINSICS_ 1247#endif // _XM_VMX128_INTRINSICS_ 1248} 1249 1250//------------------------------------------------------------------------------ 1251_Use_decl_annotations_ 1252inline void XMStoreFloat2A 1253( 1254 XMFLOAT2A* pDestination, 1255 FXMVECTOR V 1256) 1257{ 1258 assert(pDestination); 1259 assert(((uintptr_t)pDestination & 0xF) == 0); 1260#if defined(_XM_NO_INTRINSICS_) 1261 pDestination->x = V.vector4_f32[0]; 1262 
pDestination->y = V.vector4_f32[1]; 1263#elif defined(_XM_ARM_NEON_INTRINSICS_) 1264 __n64 VL = vget_low_f32(V); 1265 vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 ); 1266#elif defined(_XM_SSE_INTRINSICS_) 1267 _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); 1268#else // _XM_VMX128_INTRINSICS_ 1269#endif // _XM_VMX128_INTRINSICS_ 1270} 1271 1272//------------------------------------------------------------------------------ 1273_Use_decl_annotations_ 1274inline void XMStoreSInt2 1275( 1276 XMINT2* pDestination, 1277 FXMVECTOR V 1278) 1279{ 1280 assert(pDestination); 1281#if defined(_XM_NO_INTRINSICS_) 1282 pDestination->x = (int32_t)V.vector4_f32[0]; 1283 pDestination->y = (int32_t)V.vector4_f32[1]; 1284#elif defined(_XM_ARM_NEON_INTRINSICS_) 1285 __n64 v = vget_low_s32(V); 1286 v = vcvt_s32_f32( v ); 1287 vst1_s32( reinterpret_cast<int32_t*>(pDestination), v ); 1288#elif defined(_XM_SSE_INTRINSICS_) 1289 // In case of positive overflow, detect it 1290 XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); 1291 // Float to int conversion 1292 __m128i vResulti = _mm_cvttps_epi32(V); 1293 // If there was positive overflow, set to 0x7FFFFFFF 1294 XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); 1295 vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); 1296 vOverflow = _mm_or_ps(vOverflow,vResult); 1297 // Write two ints 1298 XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) ); 1299 _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow ); 1300 _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T ); 1301#else // _XM_VMX128_INTRINSICS_ 1302#endif // _XM_VMX128_INTRINSICS_ 1303} 1304 1305//------------------------------------------------------------------------------ 1306_Use_decl_annotations_ 1307inline void XMStoreUInt2 1308( 1309 XMUINT2* pDestination, 1310 FXMVECTOR V 1311) 1312{ 1313 assert(pDestination); 1314#if defined(_XM_NO_INTRINSICS_) 1315 pDestination->x = 
(uint32_t)V.vector4_f32[0]; 1316 pDestination->y = (uint32_t)V.vector4_f32[1]; 1317#elif defined(_XM_ARM_NEON_INTRINSICS_) 1318 __n64 v = vget_low_u32(V); 1319 v = vcvt_u32_f32( v ); 1320 vst1_u32( reinterpret_cast<uint32_t*>(pDestination), v ); 1321#elif defined(_XM_SSE_INTRINSICS_) 1322 // Clamp to >=0 1323 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 1324 // Any numbers that are too big, set to 0xFFFFFFFFU 1325 XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); 1326 XMVECTOR vValue = g_XMUnsignedFix; 1327 // Too large for a signed integer? 1328 XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); 1329 // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise 1330 vValue = _mm_and_ps(vValue,vMask); 1331 // Perform fixup only on numbers too large (Keeps low bit precision) 1332 vResult = _mm_sub_ps(vResult,vValue); 1333 __m128i vResulti = _mm_cvttps_epi32(vResult); 1334 // Convert from signed to unsigned pnly if greater than 0x80000000 1335 vMask = _mm_and_ps(vMask,g_XMNegativeZero); 1336 vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); 1337 // On those that are too large, set to 0xFFFFFFFF 1338 vResult = _mm_or_ps(vResult,vOverflow); 1339 // Write two uints 1340 XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) ); 1341 _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult ); 1342 _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T ); 1343#else // _XM_VMX128_INTRINSICS_ 1344#endif // _XM_VMX128_INTRINSICS_ 1345} 1346 1347//------------------------------------------------------------------------------ 1348_Use_decl_annotations_ 1349inline void XMStoreInt3 1350( 1351 uint32_t* pDestination, 1352 FXMVECTOR V 1353) 1354{ 1355 assert(pDestination); 1356#if defined(_XM_NO_INTRINSICS_) 1357 pDestination[0] = V.vector4_u32[0]; 1358 pDestination[1] = V.vector4_u32[1]; 1359 pDestination[2] = V.vector4_u32[2]; 1360#elif defined(_XM_ARM_NEON_INTRINSICS_) 1361 __n64 VL = vget_low_u32(V); 1362 vst1_u32( pDestination, VL ); 1363 
vst1q_lane_u32( pDestination+2, V, 2 ); 1364#elif defined(_XM_SSE_INTRINSICS_) 1365 XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 1366 XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); 1367 _mm_store_ss( reinterpret_cast<float*>(pDestination), V ); 1368 _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 ); 1369 _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 ); 1370#else // _XM_VMX128_INTRINSICS_ 1371#endif // _XM_VMX128_INTRINSICS_ 1372} 1373 1374//------------------------------------------------------------------------------ 1375_Use_decl_annotations_ 1376inline void XMStoreInt3A 1377( 1378 uint32_t* pDestination, 1379 FXMVECTOR V 1380) 1381{ 1382 assert(pDestination); 1383 assert(((uintptr_t)pDestination & 0xF) == 0); 1384#if defined(_XM_NO_INTRINSICS_) 1385 pDestination[0] = V.vector4_u32[0]; 1386 pDestination[1] = V.vector4_u32[1]; 1387 pDestination[2] = V.vector4_u32[2]; 1388#elif defined(_XM_ARM_NEON_INTRINSICS_) 1389 __n64 VL = vget_low_u32(V); 1390 vst1_u32_ex( pDestination, VL, 64 ); 1391 vst1q_lane_u32( pDestination+2, V, 2 ); 1392#elif defined(_XM_SSE_INTRINSICS_) 1393 XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); 1394 _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); 1395 _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T ); 1396#else // _XM_VMX128_INTRINSICS_ 1397#endif // _XM_VMX128_INTRINSICS_ 1398} 1399 1400//------------------------------------------------------------------------------ 1401_Use_decl_annotations_ 1402inline void XMStoreFloat3 1403( 1404 XMFLOAT3* pDestination, 1405 FXMVECTOR V 1406) 1407{ 1408 assert(pDestination); 1409#if defined(_XM_NO_INTRINSICS_) 1410 pDestination->x = V.vector4_f32[0]; 1411 pDestination->y = V.vector4_f32[1]; 1412 pDestination->z = V.vector4_f32[2]; 1413#elif defined(_XM_ARM_NEON_INTRINSICS_) 1414 __n64 VL = vget_low_f32(V); 1415 vst1_f32( reinterpret_cast<float*>(pDestination), VL ); 1416 vst1q_lane_f32( 
reinterpret_cast<float*>(pDestination)+2, V, 2 ); 1417#elif defined(_XM_SSE_INTRINSICS_) 1418 XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 1419 XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); 1420 _mm_store_ss( &pDestination->x, V ); 1421 _mm_store_ss( &pDestination->y, T1 ); 1422 _mm_store_ss( &pDestination->z, T2 ); 1423#else // _XM_VMX128_INTRINSICS_ 1424#endif // _XM_VMX128_INTRINSICS_ 1425} 1426 1427//------------------------------------------------------------------------------ 1428_Use_decl_annotations_ 1429inline void XMStoreFloat3A 1430( 1431 XMFLOAT3A* pDestination, 1432 FXMVECTOR V 1433) 1434{ 1435 assert(pDestination); 1436 assert(((uintptr_t)pDestination & 0xF) == 0); 1437#if defined(_XM_NO_INTRINSICS_) 1438 pDestination->x = V.vector4_f32[0]; 1439 pDestination->y = V.vector4_f32[1]; 1440 pDestination->z = V.vector4_f32[2]; 1441#elif defined(_XM_ARM_NEON_INTRINSICS_) 1442 __n64 VL = vget_low_f32(V); 1443 vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 ); 1444 vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 ); 1445#elif defined(_XM_SSE_INTRINSICS_) 1446 XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); 1447 _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); 1448 _mm_store_ss( &pDestination->z, T ); 1449#else // _XM_VMX128_INTRINSICS_ 1450#endif // _XM_VMX128_INTRINSICS_ 1451} 1452 1453//------------------------------------------------------------------------------ 1454_Use_decl_annotations_ 1455inline void XMStoreSInt3 1456( 1457 XMINT3* pDestination, 1458 FXMVECTOR V 1459) 1460{ 1461 assert(pDestination); 1462#if defined(_XM_NO_INTRINSICS_) 1463 pDestination->x = (int32_t)V.vector4_f32[0]; 1464 pDestination->y = (int32_t)V.vector4_f32[1]; 1465 pDestination->z = (int32_t)V.vector4_f32[2]; 1466#elif defined(_XM_ARM_NEON_INTRINSICS_) 1467 __n128 v = vcvtq_s32_f32(V); 1468 __n64 vL = vget_low_s32(v); 1469 vst1_s32( reinterpret_cast<int32_t*>(pDestination), vL ); 1470 vst1q_lane_s32( 
reinterpret_cast<int32_t*>(pDestination)+2, v, 2 ); 1471#elif defined(_XM_SSE_INTRINSICS_) 1472 // In case of positive overflow, detect it 1473 XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); 1474 // Float to int conversion 1475 __m128i vResulti = _mm_cvttps_epi32(V); 1476 // If there was positive overflow, set to 0x7FFFFFFF 1477 XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); 1478 vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); 1479 vOverflow = _mm_or_ps(vOverflow,vResult); 1480 // Write 3 uints 1481 XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1)); 1482 XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2)); 1483 _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow ); 1484 _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 ); 1485 _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 ); 1486#else // _XM_VMX128_INTRINSICS_ 1487#endif // _XM_VMX128_INTRINSICS_ 1488} 1489 1490//------------------------------------------------------------------------------ 1491_Use_decl_annotations_ 1492inline void XMStoreUInt3 1493( 1494 XMUINT3* pDestination, 1495 FXMVECTOR V 1496) 1497{ 1498 assert(pDestination); 1499#if defined(_XM_NO_INTRINSICS_) 1500 pDestination->x = (uint32_t)V.vector4_f32[0]; 1501 pDestination->y = (uint32_t)V.vector4_f32[1]; 1502 pDestination->z = (uint32_t)V.vector4_f32[2]; 1503#elif defined(_XM_ARM_NEON_INTRINSICS_) 1504 __n128 v = vcvtq_u32_f32(V); 1505 __n64 vL = vget_low_u32(v); 1506 vst1_u32( reinterpret_cast<uint32_t*>(pDestination), vL ); 1507 vst1q_lane_u32( reinterpret_cast<uint32_t*>(pDestination)+2, v, 2 ); 1508#elif defined(_XM_SSE_INTRINSICS_) 1509 // Clamp to >=0 1510 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 1511 // Any numbers that are too big, set to 0xFFFFFFFFU 1512 XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); 1513 XMVECTOR vValue = g_XMUnsignedFix; 1514 // Too large for a signed integer? 
1515 XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); 1516 // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise 1517 vValue = _mm_and_ps(vValue,vMask); 1518 // Perform fixup only on numbers too large (Keeps low bit precision) 1519 vResult = _mm_sub_ps(vResult,vValue); 1520 __m128i vResulti = _mm_cvttps_epi32(vResult); 1521 // Convert from signed to unsigned pnly if greater than 0x80000000 1522 vMask = _mm_and_ps(vMask,g_XMNegativeZero); 1523 vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); 1524 // On those that are too large, set to 0xFFFFFFFF 1525 vResult = _mm_or_ps(vResult,vOverflow); 1526 // Write 3 uints 1527 XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); 1528 XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2)); 1529 _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult ); 1530 _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 ); 1531 _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 ); 1532#else // _XM_VMX128_INTRINSICS_ 1533#endif // _XM_VMX128_INTRINSICS_ 1534} 1535 1536//------------------------------------------------------------------------------ 1537_Use_decl_annotations_ 1538inline void XMStoreInt4 1539( 1540 uint32_t* pDestination, 1541 FXMVECTOR V 1542) 1543{ 1544 assert(pDestination); 1545#if defined(_XM_NO_INTRINSICS_) 1546 pDestination[0] = V.vector4_u32[0]; 1547 pDestination[1] = V.vector4_u32[1]; 1548 pDestination[2] = V.vector4_u32[2]; 1549 pDestination[3] = V.vector4_u32[3]; 1550#elif defined(_XM_ARM_NEON_INTRINSICS_) 1551 vst1q_u32( pDestination, V ); 1552#elif defined(_XM_SSE_INTRINSICS_) 1553 _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); 1554#else // _XM_VMX128_INTRINSICS_ 1555#endif // _XM_VMX128_INTRINSICS_ 1556} 1557 1558//------------------------------------------------------------------------------ 1559_Use_decl_annotations_ 1560inline void XMStoreInt4A 1561( 1562 uint32_t* pDestination, 1563 FXMVECTOR V 1564) 1565{ 1566 
assert(pDestination); 1567 assert(((uintptr_t)pDestination & 0xF) == 0); 1568#if defined(_XM_NO_INTRINSICS_) 1569 pDestination[0] = V.vector4_u32[0]; 1570 pDestination[1] = V.vector4_u32[1]; 1571 pDestination[2] = V.vector4_u32[2]; 1572 pDestination[3] = V.vector4_u32[3]; 1573#elif defined(_XM_ARM_NEON_INTRINSICS_) 1574 vst1q_u32_ex( pDestination, V, 128 ); 1575#elif defined(_XM_SSE_INTRINSICS_) 1576 _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) ); 1577#else // _XM_VMX128_INTRINSICS_ 1578#endif // _XM_VMX128_INTRINSICS_ 1579} 1580 1581 1582//------------------------------------------------------------------------------ 1583_Use_decl_annotations_ 1584inline void XMStoreFloat4 1585( 1586 XMFLOAT4* pDestination, 1587 FXMVECTOR V 1588) 1589{ 1590 assert(pDestination); 1591#if defined(_XM_NO_INTRINSICS_) 1592 pDestination->x = V.vector4_f32[0]; 1593 pDestination->y = V.vector4_f32[1]; 1594 pDestination->z = V.vector4_f32[2]; 1595 pDestination->w = V.vector4_f32[3]; 1596#elif defined(_XM_ARM_NEON_INTRINSICS_) 1597 vst1q_f32( reinterpret_cast<float*>(pDestination), V ); 1598#elif defined(_XM_SSE_INTRINSICS_) 1599 _mm_storeu_ps( &pDestination->x, V ); 1600#else // _XM_VMX128_INTRINSICS_ 1601#endif // _XM_VMX128_INTRINSICS_ 1602} 1603 1604//------------------------------------------------------------------------------ 1605_Use_decl_annotations_ 1606inline void XMStoreFloat4A 1607( 1608 XMFLOAT4A* pDestination, 1609 FXMVECTOR V 1610) 1611{ 1612 assert(pDestination); 1613 assert(((uintptr_t)pDestination & 0xF) == 0); 1614#if defined(_XM_NO_INTRINSICS_) 1615 pDestination->x = V.vector4_f32[0]; 1616 pDestination->y = V.vector4_f32[1]; 1617 pDestination->z = V.vector4_f32[2]; 1618 pDestination->w = V.vector4_f32[3]; 1619#elif defined(_XM_ARM_NEON_INTRINSICS_) 1620 vst1q_f32_ex( reinterpret_cast<float*>(pDestination), V, 128 ); 1621#elif defined(_XM_SSE_INTRINSICS_) 1622 _mm_store_ps( &pDestination->x, V ); 1623#else // _XM_VMX128_INTRINSICS_ 
1624#endif // _XM_VMX128_INTRINSICS_ 1625} 1626 1627 1628//------------------------------------------------------------------------------ 1629_Use_decl_annotations_ 1630inline void XMStoreSInt4 1631( 1632 XMINT4* pDestination, 1633 FXMVECTOR V 1634) 1635{ 1636 assert(pDestination); 1637#if defined(_XM_NO_INTRINSICS_) 1638 pDestination->x = (int32_t)V.vector4_f32[0]; 1639 pDestination->y = (int32_t)V.vector4_f32[1]; 1640 pDestination->z = (int32_t)V.vector4_f32[2]; 1641 pDestination->w = (int32_t)V.vector4_f32[3]; 1642#elif defined(_XM_ARM_NEON_INTRINSICS_) 1643 __n128 v = vcvtq_s32_f32(V); 1644 vst1q_s32( reinterpret_cast<int32_t*>(pDestination), v ); 1645#elif defined(_XM_SSE_INTRINSICS_) 1646 // In case of positive overflow, detect it 1647 XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt); 1648 // Float to int conversion 1649 __m128i vResulti = _mm_cvttps_epi32(V); 1650 // If there was positive overflow, set to 0x7FFFFFFF 1651 XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask); 1652 vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti)); 1653 vOverflow = _mm_or_ps(vOverflow,vResult); 1654 _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) ); 1655#else // _XM_VMX128_INTRINSICS_ 1656#endif // _XM_VMX128_INTRINSICS_ 1657} 1658 1659//------------------------------------------------------------------------------ 1660_Use_decl_annotations_ 1661inline void XMStoreUInt4 1662( 1663 XMUINT4* pDestination, 1664 FXMVECTOR V 1665) 1666{ 1667 assert(pDestination); 1668#if defined(_XM_NO_INTRINSICS_) 1669 pDestination->x = (uint32_t)V.vector4_f32[0]; 1670 pDestination->y = (uint32_t)V.vector4_f32[1]; 1671 pDestination->z = (uint32_t)V.vector4_f32[2]; 1672 pDestination->w = (uint32_t)V.vector4_f32[3]; 1673#elif defined(_XM_ARM_NEON_INTRINSICS_) 1674 __n128 v = vcvtq_u32_f32(V); 1675 vst1q_u32( reinterpret_cast<uint32_t*>(pDestination), v ); 1676#elif defined(_XM_SSE_INTRINSICS_) 1677 // Clamp to >=0 1678 XMVECTOR vResult = 
_mm_max_ps(V,g_XMZero); 1679 // Any numbers that are too big, set to 0xFFFFFFFFU 1680 XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt); 1681 XMVECTOR vValue = g_XMUnsignedFix; 1682 // Too large for a signed integer? 1683 XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue); 1684 // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise 1685 vValue = _mm_and_ps(vValue,vMask); 1686 // Perform fixup only on numbers too large (Keeps low bit precision) 1687 vResult = _mm_sub_ps(vResult,vValue); 1688 __m128i vResulti = _mm_cvttps_epi32(vResult); 1689 // Convert from signed to unsigned pnly if greater than 0x80000000 1690 vMask = _mm_and_ps(vMask,g_XMNegativeZero); 1691 vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask); 1692 // On those that are too large, set to 0xFFFFFFFF 1693 vResult = _mm_or_ps(vResult,vOverflow); 1694 _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) ); 1695#else // _XM_VMX128_INTRINSICS_ 1696#endif // _XM_VMX128_INTRINSICS_ 1697} 1698 1699//------------------------------------------------------------------------------ 1700_Use_decl_annotations_ 1701inline void XMStoreFloat3x3 1702( 1703 XMFLOAT3X3* pDestination, 1704 CXMMATRIX M 1705) 1706{ 1707 assert(pDestination); 1708#if defined(_XM_NO_INTRINSICS_) 1709 1710 pDestination->m[0][0] = M.r[0].vector4_f32[0]; 1711 pDestination->m[0][1] = M.r[0].vector4_f32[1]; 1712 pDestination->m[0][2] = M.r[0].vector4_f32[2]; 1713 1714 pDestination->m[1][0] = M.r[1].vector4_f32[0]; 1715 pDestination->m[1][1] = M.r[1].vector4_f32[1]; 1716 pDestination->m[1][2] = M.r[1].vector4_f32[2]; 1717 1718 pDestination->m[2][0] = M.r[2].vector4_f32[0]; 1719 pDestination->m[2][1] = M.r[2].vector4_f32[1]; 1720 pDestination->m[2][2] = M.r[2].vector4_f32[2]; 1721 1722#elif defined(_XM_ARM_NEON_INTRINSICS_) 1723 __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 ); 1724 __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); 1725 vst1q_f32( &pDestination->m[0][0], T2 ); 1726 1727 T1 = vextq_f32( 
M.r[1], M.r[1], 1 ); 1728 T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); 1729 vst1q_f32( &pDestination->m[1][1], T2 ); 1730 1731 vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 ); 1732#elif defined(_XM_SSE_INTRINSICS_) 1733 XMVECTOR vTemp1 = M.r[0]; 1734 XMVECTOR vTemp2 = M.r[1]; 1735 XMVECTOR vTemp3 = M.r[2]; 1736 XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2)); 1737 vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0)); 1738 _mm_storeu_ps(&pDestination->m[0][0],vTemp1); 1739 vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); 1740 _mm_storeu_ps(&pDestination->m[1][1],vTemp2); 1741 vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2)); 1742 _mm_store_ss(&pDestination->m[2][2],vTemp3); 1743#else // _XM_VMX128_INTRINSICS_ 1744#endif // _XM_VMX128_INTRINSICS_ 1745} 1746 1747 1748//------------------------------------------------------------------------------ 1749_Use_decl_annotations_ 1750inline void XMStoreFloat4x3 1751( 1752 XMFLOAT4X3* pDestination, 1753 CXMMATRIX M 1754) 1755{ 1756 assert(pDestination); 1757#if defined(_XM_NO_INTRINSICS_) 1758 1759 pDestination->m[0][0] = M.r[0].vector4_f32[0]; 1760 pDestination->m[0][1] = M.r[0].vector4_f32[1]; 1761 pDestination->m[0][2] = M.r[0].vector4_f32[2]; 1762 1763 pDestination->m[1][0] = M.r[1].vector4_f32[0]; 1764 pDestination->m[1][1] = M.r[1].vector4_f32[1]; 1765 pDestination->m[1][2] = M.r[1].vector4_f32[2]; 1766 1767 pDestination->m[2][0] = M.r[2].vector4_f32[0]; 1768 pDestination->m[2][1] = M.r[2].vector4_f32[1]; 1769 pDestination->m[2][2] = M.r[2].vector4_f32[2]; 1770 1771 pDestination->m[3][0] = M.r[3].vector4_f32[0]; 1772 pDestination->m[3][1] = M.r[3].vector4_f32[1]; 1773 pDestination->m[3][2] = M.r[3].vector4_f32[2]; 1774 1775#elif defined(_XM_ARM_NEON_INTRINSICS_) 1776 __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 ); 1777 __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); 1778 vst1q_f32( &pDestination->m[0][0], T2 ); 1779 1780 T1 = vextq_f32( M.r[1], M.r[1], 1 ); 1781 
T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); 1782 vst1q_f32( &pDestination->m[1][1], T2 ); 1783 1784 T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 ); 1785 T2 = vextq_f32( T1, M.r[3], 3 ); 1786 vst1q_f32( &pDestination->m[2][2], T2 ); 1787#elif defined(_XM_SSE_INTRINSICS_) 1788 XMVECTOR vTemp1 = M.r[0]; 1789 XMVECTOR vTemp2 = M.r[1]; 1790 XMVECTOR vTemp3 = M.r[2]; 1791 XMVECTOR vTemp4 = M.r[3]; 1792 XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); 1793 vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0)); 1794 vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0)); 1795 vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); 1796 vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); 1797 _mm_storeu_ps(&pDestination->m[0][0],vTemp1); 1798 _mm_storeu_ps(&pDestination->m[1][1],vTemp2x); 1799 _mm_storeu_ps(&pDestination->m[2][2],vTemp3); 1800#else // _XM_VMX128_INTRINSICS_ 1801#endif // _XM_VMX128_INTRINSICS_ 1802} 1803 1804//------------------------------------------------------------------------------ 1805_Use_decl_annotations_ 1806inline void XMStoreFloat4x3A 1807( 1808 XMFLOAT4X3A* pDestination, 1809 CXMMATRIX M 1810) 1811{ 1812 assert(pDestination); 1813 assert(((uintptr_t)pDestination & 0xF) == 0); 1814#if defined(_XM_NO_INTRINSICS_) 1815 1816 pDestination->m[0][0] = M.r[0].vector4_f32[0]; 1817 pDestination->m[0][1] = M.r[0].vector4_f32[1]; 1818 pDestination->m[0][2] = M.r[0].vector4_f32[2]; 1819 1820 pDestination->m[1][0] = M.r[1].vector4_f32[0]; 1821 pDestination->m[1][1] = M.r[1].vector4_f32[1]; 1822 pDestination->m[1][2] = M.r[1].vector4_f32[2]; 1823 1824 pDestination->m[2][0] = M.r[2].vector4_f32[0]; 1825 pDestination->m[2][1] = M.r[2].vector4_f32[1]; 1826 pDestination->m[2][2] = M.r[2].vector4_f32[2]; 1827 1828 pDestination->m[3][0] = M.r[3].vector4_f32[0]; 1829 pDestination->m[3][1] = M.r[3].vector4_f32[1]; 1830 pDestination->m[3][2] = M.r[3].vector4_f32[2]; 1831 1832#elif 
defined(_XM_ARM_NEON_INTRINSICS_) 1833 __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 ); 1834 __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 ); 1835 vst1q_f32_ex( &pDestination->m[0][0], T2, 128 ); 1836 1837 T1 = vextq_f32( M.r[1], M.r[1], 1 ); 1838 T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) ); 1839 vst1q_f32_ex( &pDestination->m[1][1], T2, 128 ); 1840 1841 T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 ); 1842 T2 = vextq_f32( T1, M.r[3], 3 ); 1843 vst1q_f32_ex( &pDestination->m[2][2], T2, 128 ); 1844#elif defined(_XM_SSE_INTRINSICS_) 1845 // x1,y1,z1,w1 1846 XMVECTOR vTemp1 = M.r[0]; 1847 // x2,y2,z2,w2 1848 XMVECTOR vTemp2 = M.r[1]; 1849 // x3,y3,z3,w3 1850 XMVECTOR vTemp3 = M.r[2]; 1851 // x4,y4,z4,w4 1852 XMVECTOR vTemp4 = M.r[3]; 1853 // z1,z1,x2,y2 1854 XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2)); 1855 // y2,z2,x3,y3 (Final) 1856 vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1)); 1857 // x1,y1,z1,x2 (Final) 1858 vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0)); 1859 // z3,z3,x4,x4 1860 vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2)); 1861 // z3,x4,y4,z4 (Final) 1862 vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0)); 1863 // Store in 3 operations 1864 _mm_store_ps(&pDestination->m[0][0],vTemp1); 1865 _mm_store_ps(&pDestination->m[1][1],vTemp2); 1866 _mm_store_ps(&pDestination->m[2][2],vTemp3); 1867#else // _XM_VMX128_INTRINSICS_ 1868#endif // _XM_VMX128_INTRINSICS_ 1869} 1870 1871 1872//------------------------------------------------------------------------------ 1873_Use_decl_annotations_ 1874inline void XMStoreFloat4x4 1875( 1876 XMFLOAT4X4* pDestination, 1877 CXMMATRIX M 1878) 1879{ 1880 assert(pDestination); 1881#if defined(_XM_NO_INTRINSICS_) 1882 1883 pDestination->m[0][0] = M.r[0].vector4_f32[0]; 1884 pDestination->m[0][1] = M.r[0].vector4_f32[1]; 1885 pDestination->m[0][2] = M.r[0].vector4_f32[2]; 1886 pDestination->m[0][3] = M.r[0].vector4_f32[3]; 1887 1888 
pDestination->m[1][0] = M.r[1].vector4_f32[0]; 1889 pDestination->m[1][1] = M.r[1].vector4_f32[1]; 1890 pDestination->m[1][2] = M.r[1].vector4_f32[2]; 1891 pDestination->m[1][3] = M.r[1].vector4_f32[3]; 1892 1893 pDestination->m[2][0] = M.r[2].vector4_f32[0]; 1894 pDestination->m[2][1] = M.r[2].vector4_f32[1]; 1895 pDestination->m[2][2] = M.r[2].vector4_f32[2]; 1896 pDestination->m[2][3] = M.r[2].vector4_f32[3]; 1897 1898 pDestination->m[3][0] = M.r[3].vector4_f32[0]; 1899 pDestination->m[3][1] = M.r[3].vector4_f32[1]; 1900 pDestination->m[3][2] = M.r[3].vector4_f32[2]; 1901 pDestination->m[3][3] = M.r[3].vector4_f32[3]; 1902 1903#elif defined(_XM_ARM_NEON_INTRINSICS_) 1904 vst1q_f32( reinterpret_cast<float*>(&pDestination->_11), M.r[0] ); 1905 vst1q_f32( reinterpret_cast<float*>(&pDestination->_21), M.r[1] ); 1906 vst1q_f32( reinterpret_cast<float*>(&pDestination->_31), M.r[2] ); 1907 vst1q_f32( reinterpret_cast<float*>(&pDestination->_41), M.r[3] ); 1908#elif defined(_XM_SSE_INTRINSICS_) 1909 _mm_storeu_ps( &pDestination->_11, M.r[0] ); 1910 _mm_storeu_ps( &pDestination->_21, M.r[1] ); 1911 _mm_storeu_ps( &pDestination->_31, M.r[2] ); 1912 _mm_storeu_ps( &pDestination->_41, M.r[3] ); 1913#else // _XM_VMX128_INTRINSICS_ 1914#endif // _XM_VMX128_INTRINSICS_ 1915} 1916 1917//------------------------------------------------------------------------------ 1918_Use_decl_annotations_ 1919inline void XMStoreFloat4x4A 1920( 1921 XMFLOAT4X4A* pDestination, 1922 CXMMATRIX M 1923) 1924{ 1925 assert(pDestination); 1926 assert(((uintptr_t)pDestination & 0xF) == 0); 1927#if defined(_XM_NO_INTRINSICS_) 1928 1929 pDestination->m[0][0] = M.r[0].vector4_f32[0]; 1930 pDestination->m[0][1] = M.r[0].vector4_f32[1]; 1931 pDestination->m[0][2] = M.r[0].vector4_f32[2]; 1932 pDestination->m[0][3] = M.r[0].vector4_f32[3]; 1933 1934 pDestination->m[1][0] = M.r[1].vector4_f32[0]; 1935 pDestination->m[1][1] = M.r[1].vector4_f32[1]; 1936 pDestination->m[1][2] = M.r[1].vector4_f32[2]; 1937 
pDestination->m[1][3] = M.r[1].vector4_f32[3]; 1938 1939 pDestination->m[2][0] = M.r[2].vector4_f32[0]; 1940 pDestination->m[2][1] = M.r[2].vector4_f32[1]; 1941 pDestination->m[2][2] = M.r[2].vector4_f32[2]; 1942 pDestination->m[2][3] = M.r[2].vector4_f32[3]; 1943 1944 pDestination->m[3][0] = M.r[3].vector4_f32[0]; 1945 pDestination->m[3][1] = M.r[3].vector4_f32[1]; 1946 pDestination->m[3][2] = M.r[3].vector4_f32[2]; 1947 pDestination->m[3][3] = M.r[3].vector4_f32[3]; 1948 1949#elif defined(_XM_ARM_NEON_INTRINSICS_) 1950 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128 ); 1951 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128 ); 1952 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128 ); 1953 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128 ); 1954#elif defined(_XM_SSE_INTRINSICS_) 1955 _mm_store_ps( &pDestination->_11, M.r[0] ); 1956 _mm_store_ps( &pDestination->_21, M.r[1] ); 1957 _mm_store_ps( &pDestination->_31, M.r[2] ); 1958 _mm_store_ps( &pDestination->_41, M.r[3] ); 1959#else // _XM_VMX128_INTRINSICS_ 1960#endif // _XM_VMX128_INTRINSICS_ 1961} 1962