The game where you go into mines and start crafting — but for consoles (forked directly from smartcmd's GitHub repository).
at master 10596 lines 338 kB view raw
//-------------------------------------------------------------------------------------
// DirectXMathVector.inl -- SIMD C++ Math library
//
// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
// PARTICULAR PURPOSE.
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//-------------------------------------------------------------------------------------

#ifdef _MSC_VER
#pragma once
#endif

// Scalar IEEE-754 classification helpers for the no-intrinsics build:
// a float is NaN when the exponent bits are all ones and the mantissa is
// nonzero; infinity when the exponent is all ones and the mantissa is zero.
// NOTE(review): these pun a float through a uint32_t pointer, which the
// library has historically relied on the compiler to tolerate.
#if defined(_XM_NO_INTRINSICS_)
#define XMISNAN(x) ((*(uint32_t*)&(x) & 0x7F800000) == 0x7F800000 && (*(uint32_t*)&(x) & 0x7FFFFF) != 0)
#define XMISINF(x) ((*(uint32_t*)&(x) & 0x7FFFFFFF) == 0x7F800000)
#endif

/****************************************************************************
 *
 * General Vector
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Assignment operations
//------------------------------------------------------------------------------

// Every function below is compiled as exactly one of the paths in its #if
// ladder.  The final _XM_VMX128_INTRINSICS_ branch is intentionally empty in
// this public source drop.

//------------------------------------------------------------------------------
// Return a vector with all elements equaling zero
inline XMVECTOR XMVectorZero()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_setzero_ps();
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with four floating point values
inline XMVECTOR XMVectorSet
(
    float x,
    float y,
    float z,
    float w
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = {x,y,z,w};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Pack the raw bit patterns of (x,y) and (z,w) into two 64-bit halves,
    // then join them; avoids any float conversion.
    __n64 V0 = vcreate_f32(((uint64_t)*(const uint32_t *)&x) | ((uint64_t)(*(const uint32_t *)&y) << 32));
    __n64 V1 = vcreate_f32(((uint64_t)*(const uint32_t *)&z) | ((uint64_t)(*(const uint32_t *)&w) << 32));
    return vcombine_f32(V0, V1);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_set_ps takes arguments high-to-low, hence the reversed order.
    return _mm_set_ps( w, z, y, x );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with four integer values
inline XMVECTOR XMVectorSetInt
(
    uint32_t x,
    uint32_t y,
    uint32_t z,
    uint32_t w
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORU32 vResult = {x,y,z,w};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 V0 = vcreate_u32(((uint64_t)x) | ((uint64_t)y << 32));
    __n64 V1 = vcreate_u32(((uint64_t)z) | ((uint64_t)w << 32));
    return vcombine_u32(V0, V1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Reinterpret the integer register as float lanes (bit copy, no convert).
    __m128i V = _mm_set_epi32( w, z, y, x );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with a replicated floating point value
inline XMVECTOR XMVectorReplicate
(
    float Value
)
{
// Platforms that cannot do misaligned vector loads also take the scalar path.
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    XMVECTORF32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_f32( Value );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_set_ps1( Value );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with a replicated floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorReplicatePtr
(
    const float *pValue
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    float Value = pValue[0];
    XMVECTORF32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_dup_f32( pValue );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_load_ps1( pValue );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with a replicated integer value
inline XMVECTOR XMVectorReplicateInt
(
    uint32_t Value
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    XMVECTORU32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32( Value );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_set1_epi32( Value );
    return _mm_castsi128_ps(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with a replicated integer value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorReplicateIntPtr
(
    const uint32_t *pValue
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    uint32_t Value = pValue[0];
    XMVECTORU32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_dup_u32(pValue);
#elif defined(_XM_SSE_INTRINSICS_)
    // Load the 32-bit pattern through a float pointer and broadcast it.
    return _mm_load_ps1(reinterpret_cast<const float *>(pValue));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with all bits set (true mask)
inline XMVECTOR XMVectorTrueInt()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORU32 vResult = {0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Splatting signed -1 sets every bit in every lane.
    return vdupq_n_s32(-1);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_set1_epi32(-1);
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with all bits clear (false mask)
inline XMVECTOR XMVectorFalseInt()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_setzero_ps();
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Replicate the x component of the vector
inline XMVECTOR XMVectorSplatX
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[0];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_lane_f32( vget_low_f32( V ), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Replicate the y component of the vector
inline XMVECTOR XMVectorSplatY
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[1];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_lane_f32( vget_low_f32( V ), 1 );
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Replicate the z component of the vector
inline XMVECTOR XMVectorSplatZ
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[2];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // z/w live in the high 64-bit half on NEON.
    return vdupq_lane_f32( vget_high_f32( V ), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Replicate the w component of the vector
inline XMVECTOR XMVectorSplatW
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[3];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_lane_f32( vget_high_f32( V ), 1 );
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a vector of 1.0f,1.0f,1.0f,1.0f
inline XMVECTOR XMVectorSplatOne()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = 1.0f;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_f32(1.0f);
#elif defined(_XM_SSE_INTRINSICS_)
    // Library-global constant; avoids materializing the value inline.
    return g_XMOne;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a vector of INF,INF,INF,INF
inline XMVECTOR XMVectorSplatInfinity()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    // 0x7F800000 is the single-precision +infinity bit pattern.
    vResult.vector4_u32[0] =
    vResult.vector4_u32[1] =
    vResult.vector4_u32[2] =
    vResult.vector4_u32[3] = 0x7F800000;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0x7F800000);
#elif defined(_XM_SSE_INTRINSICS_)
    return g_XMInfinity;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN
inline XMVECTOR XMVectorSplatQNaN()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    // 0x7FC00000 is the canonical quiet-NaN bit pattern.
    vResult.vector4_u32[0] =
    vResult.vector4_u32[1] =
    vResult.vector4_u32[2] =
    vResult.vector4_u32[3] = 0x7FC00000;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0x7FC00000);
#elif defined(_XM_SSE_INTRINSICS_)
    return g_XMQNaN;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f
inline XMVECTOR XMVectorSplatEpsilon()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    // 0x34000000 == 1.192092896e-7f (FLT_EPSILON) as raw bits.
    vResult.vector4_u32[0] =
    vResult.vector4_u32[1] =
    vResult.vector4_u32[2] =
    vResult.vector4_u32[3] = 0x34000000;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0x34000000);
#elif defined(_XM_SSE_INTRINSICS_)
    return g_XMEpsilon;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f
inline XMVECTOR XMVectorSplatSignMask()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_u32[0] =
    vResult.vector4_u32[1] =
    vResult.vector4_u32[2] =
    vResult.vector4_u32[3] = 0x80000000U;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0x80000000U);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_set1_epi32( 0x80000000 );
    return reinterpret_cast<__m128*>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a floating point value via an index. This is not a recommended
// function to use due to performance loss.
inline float XMVectorGetByIndex(FXMVECTOR V, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Reads through the union member; forces the register to memory.
    return V.n128_f32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    return V.m128_f32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return the X component in an FPU register.
inline float XMVectorGetX(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cvtss_f32(V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Y component in an FPU register.
inline float XMVectorGetY(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 to lane 0, then extract the low float.
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Z component in an FPU register.
inline float XMVectorGetZ(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the W component in an FPU register.
inline float XMVectorGetW(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 3);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store a component indexed by i into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetByIndexPtr(float *f, FXMVECTOR V, size_t i)
{
    assert( f != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    *f = V.vector4_f32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    *f = V.n128_f32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    *f = V.m128_f32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store the X component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetXPtr(float *x, FXMVECTOR V)
{
    assert( x != NULL);
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_f32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_store_ss(x,V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Y component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetYPtr(float *y, FXMVECTOR V)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *y = V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move lane 1 into lane 0 so a scalar store writes the Y value.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    _mm_store_ss(y,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Z component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetZPtr(float *z, FXMVECTOR V)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(z,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the W component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetWPtr(float *w, FXMVECTOR V)
{
    assert( w != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(w,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Return an integer value via an index. This is not a recommended
// function to use due to performance loss.
inline uint32_t XMVectorGetIntByIndex(FXMVECTOR V, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return V.n128_u32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    return V.m128_u32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Return the X component in an integer register.
inline uint32_t XMVectorGetIntX(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Bit-cast the register to integer lanes and extract lane 0.
    return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Y component in an integer register.
inline uint32_t XMVectorGetIntY(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 to lane 0, then read it out as a scalar.
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Z component in an integer register.
inline uint32_t XMVectorGetIntZ(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the W component in an integer register.
inline uint32_t XMVectorGetIntW(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 3);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store a component indexed by i into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntByIndexPtr(uint32_t *x, FXMVECTOR V, size_t i)
{
    assert( x != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_u32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    *x = V.n128_u32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    *x = V.m128_u32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store the X component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntXPtr(uint32_t *x, FXMVECTOR V)
{
    assert( x != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_u32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_store_ss writes the raw 32-bit lane; the float pointer cast is a
    // reinterpretation of the destination, not a conversion.
    _mm_store_ss(reinterpret_cast<float *>(x),V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Y component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *y = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move lane 1 into lane 0 so the scalar store writes the Y bits.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    _mm_store_ss(reinterpret_cast<float *>(y),vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Z component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *z = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(reinterpret_cast<float *>(z),vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the W component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V)
{
    assert( w != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *w = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(reinterpret_cast<float *>(w),vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Set a single indexed floating point component
inline XMVECTOR XMVectorSetByIndex(FXMVECTOR V, float f, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_f32[i] = f;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Copy through the union so a single lane can be written in memory.
    XMVECTOR U = V;
    U.n128_f32[i] = f;
    return U;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR U = V;
    U.m128_f32[i] = f;
    return U;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets the X component of a vector to a passed floating point value
inline XMVECTOR XMVectorSetX(FXMVECTOR V, float x)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = x;
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_move_ss replaces only lane 0 of V with lane 0 of vResult.
    XMVECTOR vResult = _mm_set_ss(x);
    vResult = _mm_move_ss(V,vResult);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the Y component of a vector to a passed floating point value
inline XMVECTOR XMVectorSetY(FXMVECTOR V, float y)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = y;
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap y and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    // Convert input to vector
    XMVECTOR vTemp = _mm_set_ss(y);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap y and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
// Sets the Z component of a vector to a passed floating point value
inline XMVECTOR XMVectorSetZ(FXMVECTOR V, float z)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = z;
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap z and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
    // Convert input to vector
    XMVECTOR vTemp = _mm_set_ss(z);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap z and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the W component of a vector to a passed floating point value
inline XMVECTOR XMVectorSetW(FXMVECTOR V, float w)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = w;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap w and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
    // Convert input to vector
    XMVECTOR vTemp = _mm_set_ss(w);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap w and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets a component of a vector to a floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetByIndexPtr(FXMVECTOR V, const float *f, size_t i)
{
    assert( f != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_f32[i] = *f;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR U = V;
    U.n128_f32[i] = *f;
    return U;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR U = V;
    U.m128_f32[i] = *f;
    return U;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets the X component of a vector to a floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetXPtr(FXMVECTOR V, const float *x)
{
    assert( x != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = *x;
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Loads a single lane directly from memory into lane 0.
    return vld1q_lane_f32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = _mm_load_ss(x);
    vResult = _mm_move_ss(V,vResult);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the Y component of a vector to a floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetYPtr(FXMVECTOR V, const float *y)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = *y;
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap y and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(y);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap y and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the Z component of a vector to a floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetZPtr(FXMVECTOR V, const float *z)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = *z;
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap z and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(z);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap z and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the W component of a vector to a floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetWPtr(FXMVECTOR V, const float *w)
{
    assert( w != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = *w;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap w and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(w);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap w and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets a component of a vector to an integer passed by value
inline XMVECTOR XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_u32[i] = x;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Round-trip through the union so one 32-bit lane can be overwritten.
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = x;
    return tmp;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = x;
    return tmp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets the X component of a vector to an integer passed by value
inline XMVECTOR XMVectorSetIntX(FXMVECTOR V, uint32_t x)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = x;
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Place x in lane 0 of an integer register, then splice it into V.
    __m128i vTemp = _mm_cvtsi32_si128(x);
    XMVECTOR vResult = _mm_move_ss(V,_mm_castsi128_ps(vTemp));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the Y component of a vector to an integer passed by value
inline XMVECTOR XMVectorSetIntY(FXMVECTOR V, uint32_t y)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = y;
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap y and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    // Convert input to vector
    __m128i vTemp = _mm_cvtsi32_si128(y);
    // Replace the x component
    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
    // Swap y and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the Z component of a vector to an integer passed by value
inline XMVECTOR XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = z;
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap z and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
    // Convert input to vector
    __m128i vTemp = _mm_cvtsi32_si128(z);
    // Replace the x component
    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
    // Swap z and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the W component of a vector to an integer passed by value
inline XMVECTOR XMVectorSetIntW(FXMVECTOR V, uint32_t w)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = w;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap w and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
    // Convert input to vector
    __m128i vTemp = _mm_cvtsi32_si128(w);
    // Replace the x component
    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
    // Swap w and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets a component of a vector to an integer value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t *x, size_t i)
{
    assert( x != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_u32[i] = *x;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = *x;
    return tmp;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = *x;
    return tmp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets the X component of a vector
to an integer value passed by pointer 1115_Use_decl_annotations_ 1116inline XMVECTOR XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t *x) 1117{ 1118 assert( x != NULL ); 1119#if defined(_XM_NO_INTRINSICS_) 1120 XMVECTOR U; 1121 U.vector4_u32[0] = *x; 1122 U.vector4_u32[1] = V.vector4_u32[1]; 1123 U.vector4_u32[2] = V.vector4_u32[2]; 1124 U.vector4_u32[3] = V.vector4_u32[3]; 1125 return U; 1126#elif defined(_XM_ARM_NEON_INTRINSICS_) 1127 return vld1q_lane_u32(x,V,0); 1128#elif defined(_XM_SSE_INTRINSICS_) 1129 XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(x)); 1130 XMVECTOR vResult = _mm_move_ss(V,vTemp); 1131 return vResult; 1132#else // _XM_VMX128_INTRINSICS_ 1133#endif // _XM_VMX128_INTRINSICS_ 1134} 1135 1136// Sets the Y component of a vector to an integer value passed by pointer 1137_Use_decl_annotations_ 1138inline XMVECTOR XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t *y) 1139{ 1140 assert( y != NULL ); 1141#if defined(_XM_NO_INTRINSICS_) 1142 XMVECTOR U; 1143 U.vector4_u32[0] = V.vector4_u32[0]; 1144 U.vector4_u32[1] = *y; 1145 U.vector4_u32[2] = V.vector4_u32[2]; 1146 U.vector4_u32[3] = V.vector4_u32[3]; 1147 return U; 1148#elif defined(_XM_ARM_NEON_INTRINSICS_) 1149 return vld1q_lane_u32(y,V,1); 1150#elif defined(_XM_SSE_INTRINSICS_) 1151 // Swap y and x 1152 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); 1153 // Convert input to vector 1154 XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(y)); 1155 // Replace the x component 1156 vResult = _mm_move_ss(vResult,vTemp); 1157 // Swap y and x again 1158 vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); 1159 return vResult; 1160#else // _XM_VMX128_INTRINSICS_ 1161#endif // _XM_VMX128_INTRINSICS_ 1162} 1163 1164// Sets the Z component of a vector to an integer value passed by pointer 1165_Use_decl_annotations_ 1166inline XMVECTOR XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t *z) 1167{ 1168 assert( z != NULL ); 1169#if defined(_XM_NO_INTRINSICS_) 1170 XMVECTOR U; 1171 
U.vector4_u32[0] = V.vector4_u32[0]; 1172 U.vector4_u32[1] = V.vector4_u32[1]; 1173 U.vector4_u32[2] = *z; 1174 U.vector4_u32[3] = V.vector4_u32[3]; 1175 return U; 1176#elif defined(_XM_ARM_NEON_INTRINSICS_) 1177 return vld1q_lane_u32(z,V,2); 1178#elif defined(_XM_SSE_INTRINSICS_) 1179 // Swap z and x 1180 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); 1181 // Convert input to vector 1182 XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(z)); 1183 // Replace the x component 1184 vResult = _mm_move_ss(vResult,vTemp); 1185 // Swap z and x again 1186 vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); 1187 return vResult; 1188#else // _XM_VMX128_INTRINSICS_ 1189#endif // _XM_VMX128_INTRINSICS_ 1190} 1191 1192// Sets the W component of a vector to an integer value passed by pointer 1193_Use_decl_annotations_ 1194inline XMVECTOR XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t *w) 1195{ 1196 assert( w != NULL ); 1197#if defined(_XM_NO_INTRINSICS_) 1198 XMVECTOR U; 1199 U.vector4_u32[0] = V.vector4_u32[0]; 1200 U.vector4_u32[1] = V.vector4_u32[1]; 1201 U.vector4_u32[2] = V.vector4_u32[2]; 1202 U.vector4_u32[3] = *w; 1203 return U; 1204#elif defined(_XM_ARM_NEON_INTRINSICS_) 1205 return vld1q_lane_u32(w,V,3); 1206#elif defined(_XM_SSE_INTRINSICS_) 1207 // Swap w and x 1208 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); 1209 // Convert input to vector 1210 XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(w)); 1211 // Replace the x component 1212 vResult = _mm_move_ss(vResult,vTemp); 1213 // Swap w and x again 1214 vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); 1215 return vResult; 1216#else // _XM_VMX128_INTRINSICS_ 1217#endif // _XM_VMX128_INTRINSICS_ 1218} 1219 1220//------------------------------------------------------------------------------ 1221 1222inline XMVECTOR XMVectorSwizzle 1223( 1224 FXMVECTOR V, 1225 uint32_t E0, 1226 uint32_t E1, 1227 uint32_t E2, 1228 uint32_t E3 1229) 1230{ 1231 assert( (E0 < 4) && (E1 
< 4) && (E2 < 4) && (E3 < 4) ); 1232 _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); 1233#if defined(_XM_NO_INTRINSICS_) 1234 1235 XMVECTOR Result = { V.vector4_f32[E0], 1236 V.vector4_f32[E1], 1237 V.vector4_f32[E2], 1238 V.vector4_f32[E3] }; 1239 return Result; 1240 1241#elif defined(_XM_ARM_NEON_INTRINSICS_) 1242 static const uint32_t ControlElement[ 4 ] = 1243 { 1244#ifdef _XM_LITTLEENDIAN_ 1245 0x03020100, // XM_SWIZZLE_X 1246 0x07060504, // XM_SWIZZLE_Y 1247 0x0B0A0908, // XM_SWIZZLE_Z 1248 0x0F0E0D0C, // XM_SWIZZLE_W 1249#else 1250 0x00010203, // XM_SWIZZLE_X 1251 0x04050607, // XM_SWIZZLE_Y 1252 0x08090A0B, // XM_SWIZZLE_Z 1253 0x0C0D0E0F, // XM_SWIZZLE_W 1254#endif 1255 }; 1256 1257 int8x8x2_t tbl; 1258 tbl.val[0] = vget_low_f32(V); 1259 tbl.val[1] = vget_high_f32(V); 1260 1261 __n64 idx = vcreate_u32( ((uint64_t)ControlElement[E0]) | (((uint64_t)ControlElement[E1]) << 32) ); 1262 const __n64 rL = vtbl2_u8( tbl, idx ); 1263 1264 idx = vcreate_u32( ((uint64_t)ControlElement[E2]) | (((uint64_t)ControlElement[E3]) << 32) ); 1265 const __n64 rH = vtbl2_u8( tbl, idx ); 1266 1267 return vcombine_f32( rL, rH ); 1268#elif defined(_XM_VMX128_INTRINSICS_) 1269#else 1270 const uint32_t *aPtr = (const uint32_t* )(&V); 1271 1272 XMVECTOR Result; 1273 uint32_t *pWork = (uint32_t*)(&Result); 1274 1275 pWork[0] = aPtr[E0]; 1276 pWork[1] = aPtr[E1]; 1277 pWork[2] = aPtr[E2]; 1278 pWork[3] = aPtr[E3]; 1279 1280 return Result; 1281#endif 1282} 1283 1284//------------------------------------------------------------------------------ 1285inline XMVECTOR XMVectorPermute 1286( 1287 FXMVECTOR V1, 1288 FXMVECTOR V2, 1289 uint32_t PermuteX, 1290 uint32_t PermuteY, 1291 uint32_t PermuteZ, 1292 uint32_t PermuteW 1293) 1294{ 1295 assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); 1296 _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); 1297 1298#if defined(_XM_ARM_NEON_INTRINSICS_) && 
!defined(_XM_NO_INTRINSICS_) 1299 static const uint32_t ControlElement[ 8 ] = 1300 { 1301#ifdef _XM_LITTLEENDIAN_ 1302 0x03020100, // XM_PERMUTE_0X 1303 0x07060504, // XM_PERMUTE_0Y 1304 0x0B0A0908, // XM_PERMUTE_0Z 1305 0x0F0E0D0C, // XM_PERMUTE_0W 1306 0x13121110, // XM_PERMUTE_1X 1307 0x17161514, // XM_PERMUTE_1Y 1308 0x1B1A1918, // XM_PERMUTE_1Z 1309 0x1F1E1D1C, // XM_PERMUTE_1W 1310#else 1311 0x00010203, // XM_PERMUTE_0X 1312 0x04050607, // XM_PERMUTE_0Y 1313 0x08090A0B, // XM_PERMUTE_0Z 1314 0x0C0D0E0F, // XM_PERMUTE_0W 1315 0x10111213, // XM_PERMUTE_1X 1316 0x14151617, // XM_PERMUTE_1Y 1317 0x18191A1B, // XM_PERMUTE_1Z 1318 0x1C1D1E1F, // XM_PERMUTE_1W 1319#endif 1320 }; 1321 1322 int8x8x4_t tbl; 1323 tbl.val[0] = vget_low_f32(V1); 1324 tbl.val[1] = vget_high_f32(V1); 1325 tbl.val[2] = vget_low_f32(V2); 1326 tbl.val[3] = vget_high_f32(V2); 1327 1328 __n64 idx = vcreate_u32( ((uint64_t)ControlElement[PermuteX]) | (((uint64_t)ControlElement[PermuteY]) << 32) ); 1329 const __n64 rL = vtbl4_u8( tbl, idx ); 1330 1331 idx = vcreate_u32( ((uint64_t)ControlElement[PermuteZ]) | (((uint64_t)ControlElement[PermuteW]) << 32) ); 1332 const __n64 rH = vtbl4_u8( tbl, idx ); 1333 1334 return vcombine_f32( rL, rH ); 1335#elif defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 1336#else 1337 1338 const uint32_t *aPtr[2]; 1339 aPtr[0] = (const uint32_t* )(&V1); 1340 aPtr[1] = (const uint32_t* )(&V2); 1341 1342 XMVECTOR Result; 1343 uint32_t *pWork = (uint32_t*)(&Result); 1344 1345 const uint32_t i0 = PermuteX & 3; 1346 const uint32_t vi0 = PermuteX >> 2; 1347 pWork[0] = aPtr[vi0][i0]; 1348 1349 const uint32_t i1 = PermuteY & 3; 1350 const uint32_t vi1 = PermuteY >> 2; 1351 pWork[1] = aPtr[vi1][i1]; 1352 1353 const uint32_t i2 = PermuteZ & 3; 1354 const uint32_t vi2 = PermuteZ >> 2; 1355 pWork[2] = aPtr[vi2][i2]; 1356 1357 const uint32_t i3 = PermuteW & 3; 1358 const uint32_t vi3 = PermuteW >> 2; 1359 pWork[3] = aPtr[vi3][i3]; 1360 1361 return Result; 1362#endif 
1363} 1364 1365//------------------------------------------------------------------------------ 1366// Define a control vector to be used in XMVectorSelect 1367// operations. The four integers specified in XMVectorSelectControl 1368// serve as indices to select between components in two vectors. 1369// The first index controls selection for the first component of 1370// the vectors involved in a select operation, the second index 1371// controls selection for the second component etc. A value of 1372// zero for an index causes the corresponding component from the first 1373// vector to be selected whereas a one causes the component from the 1374// second vector to be selected instead. 1375 1376inline XMVECTOR XMVectorSelectControl 1377( 1378 uint32_t VectorIndex0, 1379 uint32_t VectorIndex1, 1380 uint32_t VectorIndex2, 1381 uint32_t VectorIndex3 1382) 1383{ 1384#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 1385 // x=Index0,y=Index1,z=Index2,w=Index3 1386 __m128i vTemp = _mm_set_epi32(VectorIndex3,VectorIndex2,VectorIndex1,VectorIndex0); 1387 // Any non-zero entries become 0xFFFFFFFF else 0 1388 vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero); 1389 return reinterpret_cast<__m128 *>(&vTemp)[0]; 1390#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 1391 __n64 V0 = vcreate_s32(((uint64_t)VectorIndex0) | ((uint64_t)VectorIndex1 << 32)); 1392 __n64 V1 = vcreate_s32(((uint64_t)VectorIndex2) | ((uint64_t)VectorIndex3 << 32)); 1393 __n128 vTemp = vcombine_s32(V0, V1); 1394 // Any non-zero entries become 0xFFFFFFFF else 0 1395 return vcgtq_s32(vTemp,g_XMZero); 1396#else 1397 XMVECTOR ControlVector; 1398 const uint32_t ControlElement[] = 1399 { 1400 XM_SELECT_0, 1401 XM_SELECT_1 1402 }; 1403 1404 assert(VectorIndex0 < 2); 1405 assert(VectorIndex1 < 2); 1406 assert(VectorIndex2 < 2); 1407 assert(VectorIndex3 < 2); 1408 _Analysis_assume_(VectorIndex0 < 2); 1409 _Analysis_assume_(VectorIndex1 < 2); 1410 _Analysis_assume_(VectorIndex2 < 2); 1411 
_Analysis_assume_(VectorIndex3 < 2); 1412 1413 ControlVector.vector4_u32[0] = ControlElement[VectorIndex0]; 1414 ControlVector.vector4_u32[1] = ControlElement[VectorIndex1]; 1415 ControlVector.vector4_u32[2] = ControlElement[VectorIndex2]; 1416 ControlVector.vector4_u32[3] = ControlElement[VectorIndex3]; 1417 1418 return ControlVector; 1419 1420#endif 1421} 1422 1423//------------------------------------------------------------------------------ 1424 1425inline XMVECTOR XMVectorSelect 1426( 1427 FXMVECTOR V1, 1428 FXMVECTOR V2, 1429 FXMVECTOR Control 1430) 1431{ 1432#if defined(_XM_NO_INTRINSICS_) 1433 1434 XMVECTOR Result; 1435 Result.vector4_u32[0] = (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]); 1436 Result.vector4_u32[1] = (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]); 1437 Result.vector4_u32[2] = (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]); 1438 Result.vector4_u32[3] = (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]); 1439 return Result; 1440 1441#elif defined(_XM_ARM_NEON_INTRINSICS_) 1442 return vbslq_f32( Control, V2, V1 ); 1443#elif defined(_XM_SSE_INTRINSICS_) 1444 XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1); 1445 XMVECTOR vTemp2 = _mm_and_ps(V2,Control); 1446 return _mm_or_ps(vTemp1,vTemp2); 1447#else // _XM_VMX128_INTRINSICS_ 1448#endif // _XM_VMX128_INTRINSICS_ 1449} 1450 1451//------------------------------------------------------------------------------ 1452 1453inline XMVECTOR XMVectorMergeXY 1454( 1455 FXMVECTOR V1, 1456 FXMVECTOR V2 1457) 1458{ 1459#if defined(_XM_NO_INTRINSICS_) 1460 1461 XMVECTOR Result; 1462 Result.vector4_u32[0] = V1.vector4_u32[0]; 1463 Result.vector4_u32[1] = V2.vector4_u32[0]; 1464 Result.vector4_u32[2] = V1.vector4_u32[1]; 1465 Result.vector4_u32[3] = V2.vector4_u32[1]; 1466 return Result; 1467 1468#elif defined(_XM_ARM_NEON_INTRINSICS_) 
1469 return vzipq_f32( V1, V2 ).val[0]; 1470#elif defined(_XM_SSE_INTRINSICS_) 1471 return _mm_unpacklo_ps( V1, V2 ); 1472#else // _XM_VMX128_INTRINSICS_ 1473#endif // _XM_VMX128_INTRINSICS_ 1474} 1475 1476//------------------------------------------------------------------------------ 1477 1478inline XMVECTOR XMVectorMergeZW 1479( 1480 FXMVECTOR V1, 1481 FXMVECTOR V2 1482) 1483{ 1484#if defined(_XM_NO_INTRINSICS_) 1485 1486 XMVECTOR Result; 1487 Result.vector4_u32[0] = V1.vector4_u32[2]; 1488 Result.vector4_u32[1] = V2.vector4_u32[2]; 1489 Result.vector4_u32[2] = V1.vector4_u32[3]; 1490 Result.vector4_u32[3] = V2.vector4_u32[3]; 1491 return Result; 1492 1493#elif defined(_XM_ARM_NEON_INTRINSICS_) 1494 return vzipq_f32( V1, V2 ).val[1]; 1495#elif defined(_XM_SSE_INTRINSICS_) 1496 return _mm_unpackhi_ps( V1, V2 ); 1497#else // _XM_VMX128_INTRINSICS_ 1498#endif // _XM_VMX128_INTRINSICS_ 1499} 1500 1501//------------------------------------------------------------------------------ 1502 1503inline XMVECTOR XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) 1504{ 1505 assert( Elements < 4 ); 1506 _Analysis_assume_( Elements < 4 ); 1507 return XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); 1508} 1509 1510//------------------------------------------------------------------------------ 1511 1512inline XMVECTOR XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) 1513{ 1514 assert( Elements < 4 ); 1515 _Analysis_assume_( Elements < 4 ); 1516 return XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); 1517} 1518 1519//------------------------------------------------------------------------------ 1520 1521inline XMVECTOR XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) 1522{ 1523 assert( Elements < 4 ); 1524 _Analysis_assume_( Elements < 4 ); 1525 return XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); 1526} 1527 
//------------------------------------------------------------------------------

// Rotates VS left by VSLeftRotateElements, then inserts the rotated
// components into VD wherever the corresponding SelectN flag (low bit) is 1;
// components with SelectN == 0 keep VD's value.
inline XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements,
                               uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3)
{
    XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1);
    return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control );
}

//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Per-component floating-point equality test. Each result component is
// 0xFFFFFFFF where V1 == V2 and 0 otherwise (usable as a select mask).
inline XMVECTOR XMVectorEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[1] = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[2] = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[3] = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vceqq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmpeq_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Same mask as XMVectorEqual, and additionally writes a CR6-style summary to
// *pCR: XM_CRMASK_CR6TRUE if all four components compared equal,
// XM_CRMASK_CR6FALSE if none did, 0 otherwise.
_Use_decl_annotations_
inline XMVECTOR XMVectorEqualR
(
    uint32_t* pCR,
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    assert( pCR != NULL );
#if defined(_XM_NO_INTRINSICS_)
    uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!(ux|uy|uz|uw))
    {
        // All elements are not equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_f32( V1, V2 );
    // Collapse the four 32-bit lane masks into a single 32-bit value
    // (one byte per lane) via two interleave passes.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // All elements are not equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    uint32_t CR = 0;
    // movemask packs one sign bit per lane: 0xf == all lanes equal
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf)
    {
        // All elements are equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // All elements are not equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Treat the components of the vectors as unsigned integers and
// compare individual bits between the two. This is useful for
// comparing control vectors and result vectors returned from
// other comparison operations.
1644 1645inline XMVECTOR XMVectorEqualInt 1646( 1647 FXMVECTOR V1, 1648 FXMVECTOR V2 1649) 1650{ 1651#if defined(_XM_NO_INTRINSICS_) 1652 1653 XMVECTOR Control; 1654 Control.vector4_u32[0] = (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0; 1655 Control.vector4_u32[1] = (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0; 1656 Control.vector4_u32[2] = (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0; 1657 Control.vector4_u32[3] = (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 0xFFFFFFFF : 0; 1658 return Control; 1659 1660#elif defined(_XM_ARM_NEON_INTRINSICS_) 1661 return vceqq_u32( V1, V2 ); 1662#elif defined(_XM_SSE_INTRINSICS_) 1663 __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); 1664 return reinterpret_cast<__m128 *>(&V)[0]; 1665#else // _XM_VMX128_INTRINSICS_ 1666#endif // _XM_VMX128_INTRINSICS_ 1667} 1668 1669//------------------------------------------------------------------------------ 1670 1671_Use_decl_annotations_ 1672inline XMVECTOR XMVectorEqualIntR 1673( 1674 uint32_t* pCR, 1675 FXMVECTOR V1, 1676 FXMVECTOR V2 1677) 1678{ 1679 assert( pCR != NULL ); 1680#if defined(_XM_NO_INTRINSICS_) 1681 1682 XMVECTOR Control = XMVectorEqualInt(V1, V2); 1683 1684 *pCR = 0; 1685 if (XMVector4EqualInt(Control, XMVectorTrueInt())) 1686 { 1687 // All elements are equal 1688 *pCR |= XM_CRMASK_CR6TRUE; 1689 } 1690 else if (XMVector4EqualInt(Control, XMVectorFalseInt())) 1691 { 1692 // All elements are not equal 1693 *pCR |= XM_CRMASK_CR6FALSE; 1694 } 1695 return Control; 1696 1697#elif defined(_XM_ARM_NEON_INTRINSICS_) 1698 __n128 vResult = vceqq_u32( V1, V2 ); 1699 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 1700 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 1701 uint32_t r = vget_lane_u32(vTemp.val[1], 1); 1702 uint32_t CR = 0; 1703 if ( r == 0xFFFFFFFFU ) 1704 { 1705 // All elements are equal 1706 CR = XM_CRMASK_CR6TRUE; 1707 } 1708 else if ( !r ) 1709 { 1710 // All elements are not equal 1711 
CR = XM_CRMASK_CR6FALSE; 1712 } 1713 *pCR = CR; 1714 return vResult; 1715#elif defined(_XM_SSE_INTRINSICS_) 1716 __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); 1717 int iTemp = _mm_movemask_ps(reinterpret_cast<const __m128*>(&V)[0]); 1718 uint32_t CR = 0; 1719 if (iTemp==0x0F) 1720 { 1721 CR = XM_CRMASK_CR6TRUE; 1722 } 1723 else if (!iTemp) 1724 { 1725 CR = XM_CRMASK_CR6FALSE; 1726 } 1727 *pCR = CR; 1728 return reinterpret_cast<__m128 *>(&V)[0]; 1729#else // _XM_VMX128_INTRINSICS_ 1730#endif // _XM_VMX128_INTRINSICS_ 1731} 1732 1733//------------------------------------------------------------------------------ 1734 1735inline XMVECTOR XMVectorNearEqual 1736( 1737 FXMVECTOR V1, 1738 FXMVECTOR V2, 1739 FXMVECTOR Epsilon 1740) 1741{ 1742#if defined(_XM_NO_INTRINSICS_) 1743 1744 float fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0]; 1745 float fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1]; 1746 float fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2]; 1747 float fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3]; 1748 1749 fDeltax = fabsf(fDeltax); 1750 fDeltay = fabsf(fDeltay); 1751 fDeltaz = fabsf(fDeltaz); 1752 fDeltaw = fabsf(fDeltaw); 1753 1754 XMVECTOR Control; 1755 Control.vector4_u32[0] = (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0; 1756 Control.vector4_u32[1] = (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0; 1757 Control.vector4_u32[2] = (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0; 1758 Control.vector4_u32[3] = (fDeltaw <= Epsilon.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; 1759 return Control; 1760 1761#elif defined(_XM_ARM_NEON_INTRINSICS_) 1762 XMVECTOR vDelta = vsubq_f32(V1,V2); 1763 return vacleq_f32( vDelta, Epsilon ); 1764#elif defined(_XM_SSE_INTRINSICS_) 1765 // Get the difference 1766 XMVECTOR vDelta = _mm_sub_ps(V1,V2); 1767 // Get the absolute value of the difference 1768 XMVECTOR vTemp = _mm_setzero_ps(); 1769 vTemp = _mm_sub_ps(vTemp,vDelta); 1770 vTemp = _mm_max_ps(vTemp,vDelta); 1771 vTemp = _mm_cmple_ps(vTemp,Epsilon); 1772 return vTemp; 1773#else // _XM_VMX128_INTRINSICS_ 1774#endif // _XM_VMX128_INTRINSICS_ 1775} 1776 1777//------------------------------------------------------------------------------ 1778 1779inline XMVECTOR XMVectorNotEqual 1780( 1781 FXMVECTOR V1, 1782 FXMVECTOR V2 1783) 1784{ 1785#if defined(_XM_NO_INTRINSICS_) 1786 1787 XMVECTOR Control; 1788 Control.vector4_u32[0] = (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; 1789 Control.vector4_u32[1] = (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; 1790 Control.vector4_u32[2] = (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; 1791 Control.vector4_u32[3] = (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; 1792 return Control; 1793 1794#elif defined(_XM_ARM_NEON_INTRINSICS_) 1795 return vmvnq_u32(vceqq_f32(V1, V2)); 1796#elif defined(_XM_SSE_INTRINSICS_) 1797 return _mm_cmpneq_ps( V1, V2 ); 1798#else // _XM_VMX128_INTRINSICS_ 1799#endif // _XM_VMX128_INTRINSICS_ 1800} 1801 1802//------------------------------------------------------------------------------ 1803 1804inline XMVECTOR XMVectorNotEqualInt 1805( 1806 FXMVECTOR V1, 1807 FXMVECTOR V2 1808) 1809{ 1810#if defined(_XM_NO_INTRINSICS_) 1811 1812 XMVECTOR Control; 1813 Control.vector4_u32[0] = (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0; 1814 Control.vector4_u32[1] = (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0; 1815 Control.vector4_u32[2] = (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 
0xFFFFFFFFU : 0; 1816 Control.vector4_u32[3] = (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0; 1817 return Control; 1818 1819#elif defined(_XM_ARM_NEON_INTRINSICS_) 1820 return vmvnq_u32(vceqq_u32(V1, V2)); 1821#elif defined(_XM_SSE_INTRINSICS_) 1822 __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); 1823 return _mm_xor_ps(reinterpret_cast<__m128 *>(&V)[0],g_XMNegOneMask); 1824#else // _XM_VMX128_INTRINSICS_ 1825#endif // _XM_VMX128_INTRINSICS_ 1826} 1827 1828//------------------------------------------------------------------------------ 1829 1830inline XMVECTOR XMVectorGreater 1831( 1832 FXMVECTOR V1, 1833 FXMVECTOR V2 1834) 1835{ 1836#if defined(_XM_NO_INTRINSICS_) 1837 1838 XMVECTOR Control; 1839 Control.vector4_u32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; 1840 Control.vector4_u32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; 1841 Control.vector4_u32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; 1842 Control.vector4_u32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; 1843 return Control; 1844 1845#elif defined(_XM_ARM_NEON_INTRINSICS_) 1846 return vcgtq_f32( V1, V2 ); 1847#elif defined(_XM_SSE_INTRINSICS_) 1848 return _mm_cmpgt_ps( V1, V2 ); 1849#else // _XM_VMX128_INTRINSICS_ 1850#endif // _XM_VMX128_INTRINSICS_ 1851} 1852 1853//------------------------------------------------------------------------------ 1854 1855_Use_decl_annotations_ 1856inline XMVECTOR XMVectorGreaterR 1857( 1858 uint32_t* pCR, 1859 FXMVECTOR V1, 1860 FXMVECTOR V2 1861) 1862{ 1863 assert( pCR != NULL ); 1864#if defined(_XM_NO_INTRINSICS_) 1865 1866 uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; 1867 uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; 1868 uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; 1869 uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0;
    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!(ux|uy|uz|uw))
    {
        // All elements are not greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgtq_f32( V1, V2 );
    // Pack the four 32-bit lane masks down so one 32-bit read tests all lanes
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // All elements are not greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    uint32_t CR = 0;
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf)
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // All elements are not greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where V1 >= V2, 0 otherwise.
inline XMVECTOR XMVectorGreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[1] = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[2] = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[3] = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vcgeq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmpge_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// As XMVectorGreaterOrEqual, but additionally writes a CR6-style comparison
// record to *pCR: XM_CRMASK_CR6TRUE when all four components compare >=,
// XM_CRMASK_CR6FALSE when none do, 0 for a mixed result.
_Use_decl_annotations_
inline XMVECTOR XMVectorGreaterOrEqualR
(
    uint32_t* pCR,
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    assert( pCR != NULL );
#if defined(_XM_NO_INTRINSICS_)

    uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!(ux|uy|uz|uw))
    {
        // All elements are not greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgeq_f32( V1, V2 );
    // Pack the four 32-bit lane masks down so one 32-bit read tests all lanes
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // All elements are not greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    uint32_t CR = 0;
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf)
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // All elements are not greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where V1 < V2, 0 otherwise.
inline XMVECTOR XMVectorLess
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vcltq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmplt_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where V1 <= V2, 0 otherwise.
inline XMVECTOR XMVectorLessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[1] = (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[2] = (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[3] = (V1.vector4_f32[3] <= V2.vector4_f32[3]) ?
0xFFFFFFFF : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vcleq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmple_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where -Bounds <= V <= Bounds, 0 otherwise.
inline XMVECTOR XMVectorInBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[1] = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[2] = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[3] = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFF : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = vcleq_f32(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = vnegq_f32(Bounds);
    // Test if greater or equal (Reversed)
    vTemp2 = vcleq_f32(vTemp2,V);
    // Blend answers
    vTemp1 = vandq_u32(vTemp1,vTemp2);
    return vTemp1;
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);
    return vTemp1;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// As XMVectorInBounds, but additionally writes a comparison record to *pCR:
// XM_CRMASK_CR6BOUNDS when all four components are within +/-Bounds, else 0.
_Use_decl_annotations_
inline XMVECTOR XMVectorInBoundsR
(
    uint32_t* pCR,
    FXMVECTOR V,
    FXMVECTOR Bounds
)
{
    assert( pCR != NULL );
#if defined(_XM_NO_INTRINSICS_)

    uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ?
0xFFFFFFFFU : 0;

    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = vcleq_f32(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = vnegq_f32(Bounds);
    // Test if greater or equal (Reversed)
    vTemp2 = vcleq_f32(vTemp2,V);
    // Blend answers
    vTemp1 = vandq_u32(vTemp1,vTemp2);
    // Pack the four 32-bit lane masks down so one 32-bit read tests all lanes
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;
    return vTemp1;
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);

    uint32_t CR = 0;
    if (_mm_movemask_ps(vTemp1)==0xf) {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;
    return vTemp1;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where the component is NaN, 0 otherwise.
inline XMVECTOR XMVectorIsNaN
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[1] = XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[2] = XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[3] = XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test against itself. NaN is always not equal
    __n128 vTempNan = vceqq_f32( V, V );
    // Flip results
    return vmvnq_u32( vTempNan );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test against itself. NaN is always not equal
    return _mm_cmpneq_ps(V,V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where the component is +/- infinity,
// 0 otherwise.
inline XMVECTOR XMVectorIsInfinite
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[1] = XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[2] = XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[3] = XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Mask off the sign bit
    __n128 vTemp = vandq_u32(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = vceqq_f32(vTemp,g_XMInfinity);
    // If any are infinity, the signs are true.
    return vTemp;
#elif defined(_XM_SSE_INTRINSICS_)
    // Mask off the sign bit
    __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
    // If any are infinity, the signs are true.
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Rounding and clamping operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Component-wise minimum of V1 and V2.
inline XMVECTOR XMVectorMin
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
    Result.vector4_f32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
    Result.vector4_f32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
    Result.vector4_f32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vminq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_min_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise maximum of V1 and V2.
inline XMVECTOR XMVectorMax
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
    Result.vector4_f32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
    Result.vector4_f32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
    Result.vector4_f32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ?
V1.vector4_f32[3] : V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmaxq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_max_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Round each component to the nearest integer (the scalar/NEON paths bias by
// +/-0.5 toward the value's sign, then truncate toward zero).
inline XMVECTOR XMVectorRound
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();
    const XMVECTOR BiasPos = XMVectorReplicate(0.5f);
    const XMVECTOR BiasNeg = XMVectorReplicate(-0.5f);

    // Pick the bias matching each component's sign, add it, then truncate.
    XMVECTOR Bias = XMVectorLess(V, Zero);
    Bias = XMVectorSelect(BiasPos, BiasNeg, Bias);
    XMVECTOR Result = XMVectorAdd(V, Bias);
    Result = XMVectorTruncate(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vTest = vabsq_f32( V );
    vTest = vcltq_f32( vTest, g_XMNoFraction );

    __n128 Bias = vcltq_f32( V, vdupq_n_u32(0) );

    __n128 BiasPos = vdupq_n_f32( 0.5f );
    __n128 BiasNeg = vdupq_n_f32( -0.5f );
    Bias = vbslq_f32( Bias, BiasNeg, BiasPos );
    __n128 V0 = vaddq_f32( V, Bias );
    __n128 vInt = vcvtq_s32_f32( V0 );
    __n128 vResult = vcvtq_f32_s32( vInt );

    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    __m128i vInt = _mm_cvtps_epi32(V);
    // Convert back to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Truncate each component toward zero. NaN components become 0x7FC00000
// (a quiet NaN pattern); magnitudes >= 8388608 (2^23, already integral)
// pass through unchanged.
inline XMVECTOR XMVectorTruncate
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    uint32_t i;

    // Avoid C4701
    Result.vector4_f32[0] = 0.0f;

    for (i = 0; i < 4; i++)
    {
        if (XMISNAN(V.vector4_f32[i]))
        {
            Result.vector4_u32[i] = 0x7FC00000;
        }
        else if (fabsf(V.vector4_f32[i]) < 8388608.0f)
        {
            Result.vector4_f32[i] = (float)((int32_t)V.vector4_f32[i]);
        }
        else
        {
            Result.vector4_f32[i] = V.vector4_f32[i];
        }
    }
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vTest = vabsq_f32( V );
    vTest = vcltq_f32( vTest, g_XMNoFraction );

    __n128 vInt = vcvtq_s32_f32( V );
    __n128 vResult = vcvtq_f32_s32( vInt );

    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding with truncation
    __m128i vInt = _mm_cvttps_epi32(V);
    // Convert back to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Round each component down (toward negative infinity).
inline XMVECTOR XMVectorFloor
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR vResult = {
        floorf(V.vector4_f32[0]),
        floorf(V.vector4_f32[1]),
        floorf(V.vector4_f32[2]),
        floorf(V.vector4_f32[3])
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Subtract a bias just under 0.5 (0x3EFFFFA0 as float bits), then round.
    __n128 V0 = vsubq_f32( V, vdupq_n_u32(0x3EFFFFA0) );
    return XMVectorRound(V0);
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    XMVECTOR vResult = _mm_sub_ps(V,g_XMOneHalfMinusEpsilon);
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Convert back to floats
    vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Round each component up (toward positive infinity).
inline XMVECTOR XMVectorCeiling
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {
        ceilf(V.vector4_f32[0]),
        ceilf(V.vector4_f32[1]),
        ceilf(V.vector4_f32[2]),
        ceilf(V.vector4_f32[3])
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Add a bias just under 0.5 (0x3EFFFFA0 as float bits), then round.
    __n128 V0 = vaddq_f32( V, vdupq_n_u32(0x3EFFFFA0) );
    return XMVectorRound(V0);
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    XMVECTOR vResult = _mm_add_ps(V,g_XMOneHalfMinusEpsilon);
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Convert back to floats
    vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Clamp each component of V into [Min, Max]; asserts Min <= Max per component.
inline XMVECTOR XMVectorClamp
(
    FXMVECTOR V,
    FXMVECTOR Min,
    FXMVECTOR Max
)
{
    assert(XMVector4LessOrEqual(Min, Max));

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVectorMax(Min, V);
    Result = XMVectorMin(Max, Result);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR vResult;
    vResult = vmaxq_f32(Min,V);
    vResult = vminq_f32(vResult,Max);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult;
    vResult = _mm_max_ps(Min,V);
    vResult = _mm_min_ps(vResult,Max);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Clamp each component of V into [0, 1].
inline XMVECTOR XMVectorSaturate
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    return XMVectorClamp(V, Zero, g_XMOne.v);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Set <0 to 0
    XMVECTOR vResult = vmaxq_f32(V, vdupq_n_u32(0) );
    // Set>1 to 1
    return vminq_f32(vResult, vdupq_n_f32(1.0f) );
#elif defined(_XM_SSE_INTRINSICS_)
    // Set <0 to 0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Set>1 to 1
    return _mm_min_ps(vResult,g_XMOne);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Bitwise logical operations
//------------------------------------------------------------------------------

// Bitwise AND of the two vectors' 128-bit contents.
inline XMVECTOR XMVectorAndInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] & V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] & V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] & V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] & V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vandq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_and_ps(V1,V2);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Bitwise AND-NOT: V1 & ~V2, applied to the 128-bit contents.
inline XMVECTOR XMVectorAndCInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] & ~V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] & ~V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] & ~V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] & ~V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vbicq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_andnot_si128 computes ~arg0 & arg1, so the operands are swapped
    __m128i V = _mm_andnot_si128( _mm_castps_si128(V2), _mm_castps_si128(V1) );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Bitwise OR of the two vectors' 128-bit contents.
inline XMVECTOR XMVectorOrInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] | V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] | V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] | V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] | V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vorrq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Bitwise NOR: ~(V1 | V2), applied to the 128-bit contents.
inline XMVECTOR XMVectorNorInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = ~(V1.vector4_u32[0] | V2.vector4_u32[0]);
    Result.vector4_u32[1] = ~(V1.vector4_u32[1] | V2.vector4_u32[1]);
    Result.vector4_u32[2] = ~(V1.vector4_u32[2] | V2.vector4_u32[2]);
    Result.vector4_u32[3] = ~(V1.vector4_u32[3] | V2.vector4_u32[3]);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 Result = vorrq_u32(V1,V2);
    return vbicq_u32(g_XMNegOneMask, Result);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i Result;
    Result = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    // andnot with all-ones inverts the OR result
    Result = _mm_andnot_si128( Result,g_XMNegOneMask);
    return reinterpret_cast<__m128 *>(&Result)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Bitwise XOR of the two vectors' 128-bit contents.
inline XMVECTOR XMVectorXorInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] ^ V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] ^ V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] ^ V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] ^ V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return veorq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_xor_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Negate each component of V.
inline XMVECTOR XMVectorNegate
(
    FXMVECTOR V
)
{
#if \
defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = -V.vector4_f32[0];
    Result.vector4_f32[1] = -V.vector4_f32[1];
    Result.vector4_f32[2] = -V.vector4_f32[2];
    Result.vector4_f32[3] = -V.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vnegq_f32(V);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR Z;

    Z = _mm_setzero_ps();

    return _mm_sub_ps( Z, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise sum V1 + V2.
inline XMVECTOR XMVectorAdd
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = V1.vector4_f32[0] + V2.vector4_f32[0];
    Result.vector4_f32[1] = V1.vector4_f32[1] + V2.vector4_f32[1];
    Result.vector4_f32[2] = V1.vector4_f32[2] + V2.vector4_f32[2];
    Result.vector4_f32[3] = V1.vector4_f32[3] + V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vaddq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_add_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise angle sum, wrapped back into [-Pi, Pi).
inline XMVECTOR XMVectorAddAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    // Add the given angles together. If the range of V1 is such
    // that -Pi <= V1 < Pi and the range of V2 is such that
    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
    // will be -Pi <= Result < Pi.
    XMVECTOR Result = XMVectorAdd(V1, V2);

    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);

    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);

    Result = XMVectorAdd(Result, Offset);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Adjust the angles
    __n128 vResult = vaddq_f32(V1,V2);
    // Less than Pi?
    __n128 vOffset = vcltq_f32(vResult,g_XMNegativePi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = vaddq_f32(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = vcgeq_f32(vResult,g_XMPi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = vsubq_f32(vResult,vOffset);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Adjust the angles
    XMVECTOR vResult = _mm_add_ps(V1,V2);
    // Less than Pi?
    XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = _mm_add_ps(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = _mm_cmpge_ps(vResult,g_XMPi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = _mm_sub_ps(vResult,vOffset);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise difference V1 - V2.
inline XMVECTOR XMVectorSubtract
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = V1.vector4_f32[0] - V2.vector4_f32[0];
    Result.vector4_f32[1] = V1.vector4_f32[1] - V2.vector4_f32[1];
    Result.vector4_f32[2] = V1.vector4_f32[2] - V2.vector4_f32[2];
    Result.vector4_f32[3] = V1.vector4_f32[3] - V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsubq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_sub_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise angle difference, wrapped back into [-Pi, Pi).
inline XMVECTOR XMVectorSubtractAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    // Subtract the given angles. If the range of V1 is such
    // that -Pi <= V1 < Pi and the range of V2 is such that
    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
    // will be -Pi <= Result < Pi.
XMVECTOR Result = XMVectorSubtract(V1, V2);

    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);

    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);

    Result = XMVectorAdd(Result, Offset);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Adjust the angles
    __n128 vResult = vsubq_f32(V1,V2);
    // Less than Pi?
    __n128 vOffset = vcltq_f32(vResult,g_XMNegativePi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = vaddq_f32(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = vcgeq_f32(vResult,g_XMPi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = vsubq_f32(vResult,vOffset);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Adjust the angles
    XMVECTOR vResult = _mm_sub_ps(V1,V2);
    // Less than Pi?
    XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = _mm_add_ps(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = _mm_cmpge_ps(vResult,g_XMPi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = _mm_sub_ps(vResult,vOffset);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise product V1 * V2.
inline XMVECTOR XMVectorMultiply
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result = {
        V1.vector4_f32[0] * V2.vector4_f32[0],
        V1.vector4_f32[1] * V2.vector4_f32[1],
        V1.vector4_f32[2] * V2.vector4_f32[2],
        V1.vector4_f32[3] * V2.vector4_f32[3]
    };
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmulq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_mul_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise multiply-add: V1 * V2 + V3.
inline XMVECTOR XMVectorMultiplyAdd
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR V3
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {
        (V1.vector4_f32[0] * V2.vector4_f32[0]) + V3.vector4_f32[0],
        (V1.vector4_f32[1] * V2.vector4_f32[1]) + V3.vector4_f32[1],
        (V1.vector4_f32[2] * V2.vector4_f32[2]) + V3.vector4_f32[2],
        (V1.vector4_f32[3] * V2.vector4_f32[3]) + V3.vector4_f32[3]
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmlaq_f32( V3, V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = _mm_mul_ps( V1, V2 );
    return _mm_add_ps(vResult, V3 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise quotient V1 / V2 (the NEON path approximates via a
// refined reciprocal rather than a true divide).
inline XMVECTOR XMVectorDivide
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result.vector4_f32[0] = V1.vector4_f32[0] / V2.vector4_f32[0];
    Result.vector4_f32[1] = V1.vector4_f32[1] / V2.vector4_f32[1];
    Result.vector4_f32[2] = V1.vector4_f32[2] / V2.vector4_f32[2];
    Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3];
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 2 iterations of Newton-Raphson refinement of reciprocal
    __n128 Reciprocal = vrecpeq_f32(V2);
    __n128 S = vrecpsq_f32( Reciprocal, V2 );
    Reciprocal = vmulq_f32( S, Reciprocal );
    S = vrecpsq_f32( Reciprocal, V2 );
    Reciprocal = vmulq_f32( S, Reciprocal );
    return vmulq_f32( V1, Reciprocal );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_div_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise negative multiply-subtract: V3 - (V1 * V2).
inline XMVECTOR XMVectorNegativeMultiplySubtract
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR V3
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR vResult = {
        V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]),
        V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]),
        V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]),
        V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3])
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmlsq_f32( V3, V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR R = _mm_mul_ps( V1, V2 );
    return _mm_sub_ps( V3, R );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Multiply each component of V by the scalar ScaleFactor.
inline XMVECTOR XMVectorScale
(
    FXMVECTOR V,
    float ScaleFactor
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {
V.vector4_f32[0] * ScaleFactor, 3019 V.vector4_f32[1] * ScaleFactor, 3020 V.vector4_f32[2] * ScaleFactor, 3021 V.vector4_f32[3] * ScaleFactor 3022 }; 3023 return vResult; 3024 3025#elif defined(_XM_ARM_NEON_INTRINSICS_) 3026 return vmulq_n_f32( V, ScaleFactor ); 3027#elif defined(_XM_SSE_INTRINSICS_) 3028 XMVECTOR vResult = _mm_set_ps1(ScaleFactor); 3029 return _mm_mul_ps(vResult,V); 3030#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 3031#endif // _XM_VMX128_INTRINSICS_ 3032} 3033 3034//------------------------------------------------------------------------------ 3035 3036inline XMVECTOR XMVectorReciprocalEst 3037( 3038 FXMVECTOR V 3039) 3040{ 3041#if defined(_XM_NO_INTRINSICS_) 3042 XMVECTOR Result; 3043 Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; 3044 Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; 3045 Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; 3046 Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; 3047 return Result; 3048#elif defined(_XM_ARM_NEON_INTRINSICS_) 3049 return vrecpeq_f32(V); 3050#elif defined(_XM_SSE_INTRINSICS_) 3051 return _mm_rcp_ps(V); 3052#else // _XM_VMX128_INTRINSICS_ 3053#endif // _XM_VMX128_INTRINSICS_ 3054} 3055 3056//------------------------------------------------------------------------------ 3057 3058inline XMVECTOR XMVectorReciprocal 3059( 3060 FXMVECTOR V 3061) 3062{ 3063#if defined(_XM_NO_INTRINSICS_) 3064 XMVECTOR Result; 3065 Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; 3066 Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; 3067 Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; 3068 Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; 3069 return Result; 3070#elif defined(_XM_ARM_NEON_INTRINSICS_) 3071 // 2 iterations of Newton-Raphson refinement 3072 __n128 Reciprocal = vrecpeq_f32(V); 3073 __n128 S = vrecpsq_f32( Reciprocal, V ); 3074 Reciprocal = vmulq_f32( S, Reciprocal ); 3075 S = vrecpsq_f32( Reciprocal, V ); 3076 return vmulq_f32( S, Reciprocal ); 3077#elif defined(_XM_SSE_INTRINSICS_) 3078 return _mm_div_ps(g_XMOne,V); 
3079#else // _XM_VMX128_INTRINSICS_ 3080#endif // _XM_VMX128_INTRINSICS_ 3081} 3082 3083//------------------------------------------------------------------------------ 3084// Return an estimated square root 3085inline XMVECTOR XMVectorSqrtEst 3086( 3087 FXMVECTOR V 3088) 3089{ 3090#if defined(_XM_NO_INTRINSICS_) 3091 XMVECTOR Result; 3092 Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); 3093 Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); 3094 Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); 3095 Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); 3096 return Result; 3097#elif defined(_XM_ARM_NEON_INTRINSICS_) 3098 // 1 iteration of Newton-Raphson refinment of sqrt 3099 __n128 S0 = vrsqrteq_f32(V); 3100 __n128 P0 = vmulq_f32( V, S0 ); 3101 __n128 R0 = vrsqrtsq_f32( P0, S0 ); 3102 __n128 S1 = vmulq_f32( S0, R0 ); 3103 3104 XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); 3105 XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); 3106 __n128 Result = vmulq_f32( V, S1 ); 3107 XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); 3108 return XMVectorSelect(V, Result, Select); 3109#elif defined(_XM_SSE_INTRINSICS_) 3110 return _mm_sqrt_ps(V); 3111#else // _XM_VMX128_INTRINSICS_ 3112#endif // _XM_VMX128_INTRINSICS_ 3113} 3114 3115//------------------------------------------------------------------------------ 3116 3117inline XMVECTOR XMVectorSqrt 3118( 3119 FXMVECTOR V 3120) 3121{ 3122#if defined(_XM_NO_INTRINSICS_) 3123 XMVECTOR Result; 3124 Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); 3125 Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); 3126 Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); 3127 Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); 3128 return Result; 3129#elif defined(_XM_ARM_NEON_INTRINSICS_) 3130 // 3 iterations of Newton-Raphson refinment of sqrt 3131 __n128 S0 = vrsqrteq_f32(V); 3132 __n128 P0 = vmulq_f32( V, S0 ); 3133 __n128 R0 = vrsqrtsq_f32( P0, S0 ); 3134 __n128 S1 = vmulq_f32( S0, R0 ); 3135 
__n128 P1 = vmulq_f32( V, S1 ); 3136 __n128 R1 = vrsqrtsq_f32( P1, S1 ); 3137 __n128 S2 = vmulq_f32( S1, R1 ); 3138 __n128 P2 = vmulq_f32( V, S2 ); 3139 __n128 R2 = vrsqrtsq_f32( P2, S2 ); 3140 __n128 S3 = vmulq_f32( S2, R2 ); 3141 3142 XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); 3143 XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); 3144 __n128 Result = vmulq_f32( V, S3 ); 3145 XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); 3146 return XMVectorSelect(V, Result, Select); 3147#elif defined(_XM_SSE_INTRINSICS_) 3148 return _mm_sqrt_ps(V); 3149#else // _XM_VMX128_INTRINSICS_ 3150#endif // _XM_VMX128_INTRINSICS_ 3151} 3152 3153//------------------------------------------------------------------------------ 3154 3155inline XMVECTOR XMVectorReciprocalSqrtEst 3156( 3157 FXMVECTOR V 3158) 3159{ 3160#if defined(_XM_NO_INTRINSICS_) 3161 XMVECTOR Result; 3162 Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); 3163 Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); 3164 Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); 3165 Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); 3166 return Result; 3167#elif defined(_XM_ARM_NEON_INTRINSICS_) 3168 return vrsqrteq_f32(V); 3169#elif defined(_XM_SSE_INTRINSICS_) 3170 return _mm_rsqrt_ps(V); 3171#else // _XM_VMX128_INTRINSICS_ 3172#endif // _XM_VMX128_INTRINSICS_ 3173} 3174 3175//------------------------------------------------------------------------------ 3176 3177inline XMVECTOR XMVectorReciprocalSqrt 3178( 3179 FXMVECTOR V 3180) 3181{ 3182#if defined(_XM_NO_INTRINSICS_) 3183 XMVECTOR Result; 3184 Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); 3185 Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); 3186 Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); 3187 Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); 3188 return Result; 3189#elif defined(_XM_ARM_NEON_INTRINSICS_) 3190 // 2 iterations of Newton-Raphson refinement of reciprocal 
3191 __n128 S0 = vrsqrteq_f32(V); 3192 3193 __n128 P0 = vmulq_f32( V, S0 ); 3194 __n128 R0 = vrsqrtsq_f32( P0, S0 ); 3195 3196 __n128 S1 = vmulq_f32( S0, R0 ); 3197 __n128 P1 = vmulq_f32( V, S1 ); 3198 __n128 R1 = vrsqrtsq_f32( P1, S1 ); 3199 3200 return vmulq_f32( S1, R1 ); 3201#elif defined(_XM_SSE_INTRINSICS_) 3202 XMVECTOR vResult = _mm_sqrt_ps(V); 3203 vResult = _mm_div_ps(g_XMOne,vResult); 3204 return vResult; 3205#else // _XM_VMX128_INTRINSICS_ 3206#endif // _XM_VMX128_INTRINSICS_ 3207} 3208 3209 3210//------------------------------------------------------------------------------ 3211 3212inline XMVECTOR XMVectorExp 3213( 3214 FXMVECTOR V 3215) 3216{ 3217#if defined(_XM_NO_INTRINSICS_) 3218 3219 XMVECTOR Result; 3220 Result.vector4_f32[0] = powf(2.0f, V.vector4_f32[0]); 3221 Result.vector4_f32[1] = powf(2.0f, V.vector4_f32[1]); 3222 Result.vector4_f32[2] = powf(2.0f, V.vector4_f32[2]); 3223 Result.vector4_f32[3] = powf(2.0f, V.vector4_f32[3]); 3224 return Result; 3225 3226#elif defined(_XM_ARM_NEON_INTRINSICS_) 3227 XMVECTORF32 vResult = { 3228 powf(2.0f,vgetq_lane_f32(V, 0)), 3229 powf(2.0f,vgetq_lane_f32(V, 1)), 3230 powf(2.0f,vgetq_lane_f32(V, 2)), 3231 powf(2.0f,vgetq_lane_f32(V, 3)) 3232 }; 3233 return vResult; 3234#elif defined(_XM_SSE_INTRINSICS_) 3235 __declspec(align(16)) float a[4]; 3236 _mm_store_ps( a, V ); 3237 XMVECTOR vResult = _mm_setr_ps( 3238 powf(2.0f,a[0]), 3239 powf(2.0f,a[1]), 3240 powf(2.0f,a[2]), 3241 powf(2.0f,a[3])); 3242 return vResult; 3243#else // _XM_VMX128_INTRINSICS_ 3244#endif // _XM_VMX128_INTRINSICS_ 3245} 3246 3247 3248//------------------------------------------------------------------------------ 3249 3250inline XMVECTOR XMVectorLog 3251( 3252 FXMVECTOR V 3253) 3254{ 3255#if defined(_XM_NO_INTRINSICS_) 3256 3257 const float fScale = 1.4426950f; // (1.0f / logf(2.0f)); 3258 3259 XMVECTOR Result; 3260 Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale; 3261 Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale; 3262 
Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale; 3263 Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale; 3264 return Result; 3265 3266#elif defined(_XM_ARM_NEON_INTRINSICS_) 3267 XMVECTOR vScale = vdupq_n_f32(1.0f / logf(2.0f)); 3268 XMVECTORF32 vResult = { 3269 logf(vgetq_lane_f32(V, 0)), 3270 logf(vgetq_lane_f32(V, 1)), 3271 logf(vgetq_lane_f32(V, 2)), 3272 logf(vgetq_lane_f32(V, 3)) 3273 }; 3274 return vmulq_f32( vResult, vScale ); 3275#elif defined(_XM_SSE_INTRINSICS_) 3276 __declspec(align(16)) float a[4]; 3277 _mm_store_ps( a, V ); 3278 XMVECTOR vScale = _mm_set_ps1(1.0f / logf(2.0f)); 3279 XMVECTOR vResult = _mm_setr_ps( 3280 logf(a[0]), 3281 logf(a[1]), 3282 logf(a[2]), 3283 logf(a[3])); 3284 vResult = _mm_mul_ps(vResult,vScale); 3285 return vResult; 3286#else // _XM_VMX128_INTRINSICS_ 3287#endif // _XM_VMX128_INTRINSICS_ 3288} 3289 3290 3291//------------------------------------------------------------------------------ 3292 3293inline XMVECTOR XMVectorPow 3294( 3295 FXMVECTOR V1, 3296 FXMVECTOR V2 3297) 3298{ 3299#if defined(_XM_NO_INTRINSICS_) 3300 3301 XMVECTOR Result; 3302 Result.vector4_f32[0] = powf(V1.vector4_f32[0], V2.vector4_f32[0]); 3303 Result.vector4_f32[1] = powf(V1.vector4_f32[1], V2.vector4_f32[1]); 3304 Result.vector4_f32[2] = powf(V1.vector4_f32[2], V2.vector4_f32[2]); 3305 Result.vector4_f32[3] = powf(V1.vector4_f32[3], V2.vector4_f32[3]); 3306 return Result; 3307 3308#elif defined(_XM_ARM_NEON_INTRINSICS_) 3309 XMVECTORF32 vResult = { 3310 powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)), 3311 powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)), 3312 powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)), 3313 powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3)) 3314 }; 3315 return vResult; 3316#elif defined(_XM_SSE_INTRINSICS_) 3317 __declspec(align(16)) float a[4]; 3318 __declspec(align(16)) float b[4]; 3319 _mm_store_ps( a, V1 ); 3320 _mm_store_ps( b, V2 ); 3321 XMVECTOR vResult = _mm_setr_ps( 3322 powf(a[0],b[0]), 3323 
powf(a[1],b[1]), 3324 powf(a[2],b[2]), 3325 powf(a[3],b[3])); 3326 return vResult; 3327#else // _XM_VMX128_INTRINSICS_ 3328#endif // _XM_VMX128_INTRINSICS_ 3329} 3330 3331//------------------------------------------------------------------------------ 3332 3333inline XMVECTOR XMVectorAbs 3334( 3335 FXMVECTOR V 3336) 3337{ 3338#if defined(_XM_NO_INTRINSICS_) 3339 XMVECTOR vResult = { 3340 fabsf(V.vector4_f32[0]), 3341 fabsf(V.vector4_f32[1]), 3342 fabsf(V.vector4_f32[2]), 3343 fabsf(V.vector4_f32[3]) 3344 }; 3345 return vResult; 3346 3347#elif defined(_XM_ARM_NEON_INTRINSICS_) 3348 return vabsq_f32( V ); 3349#elif defined(_XM_SSE_INTRINSICS_) 3350 XMVECTOR vResult = _mm_setzero_ps(); 3351 vResult = _mm_sub_ps(vResult,V); 3352 vResult = _mm_max_ps(vResult,V); 3353 return vResult; 3354#else // _XM_VMX128_INTRINSICS_ 3355#endif // _XM_VMX128_INTRINSICS_ 3356} 3357 3358//------------------------------------------------------------------------------ 3359 3360inline XMVECTOR XMVectorMod 3361( 3362 FXMVECTOR V1, 3363 FXMVECTOR V2 3364) 3365{ 3366 // V1 % V2 = V1 - V2 * truncate(V1 / V2) 3367 3368#if defined(_XM_NO_INTRINSICS_) 3369 3370 XMVECTOR Quotient = XMVectorDivide(V1, V2); 3371 Quotient = XMVectorTruncate(Quotient); 3372 XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); 3373 return Result; 3374 3375#elif defined(_XM_ARM_NEON_INTRINSICS_) 3376 XMVECTOR vResult = XMVectorDivide(V1, V2); 3377 vResult = XMVectorTruncate(vResult); 3378 return vmlsq_f32( V1, vResult, V2 ); 3379#elif defined(_XM_SSE_INTRINSICS_) 3380 XMVECTOR vResult = _mm_div_ps(V1, V2); 3381 vResult = XMVectorTruncate(vResult); 3382 vResult = _mm_mul_ps(vResult,V2); 3383 vResult = _mm_sub_ps(V1,vResult); 3384 return vResult; 3385#else // _XM_VMX128_INTRINSICS_ 3386#endif // _XM_VMX128_INTRINSICS_ 3387} 3388 3389//------------------------------------------------------------------------------ 3390 3391inline XMVECTOR XMVectorModAngles 3392( 3393 FXMVECTOR Angles 3394) 3395{ 3396#if 
defined(_XM_NO_INTRINSICS_) 3397 3398 XMVECTOR V; 3399 XMVECTOR Result; 3400 3401 // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI 3402 V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); 3403 V = XMVectorRound(V); 3404 Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); 3405 return Result; 3406 3407#elif defined(_XM_ARM_NEON_INTRINSICS_) 3408 // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI 3409 XMVECTOR vResult = vmulq_f32(Angles,g_XMReciprocalTwoPi); 3410 // Use the inline function due to complexity for rounding 3411 vResult = XMVectorRound(vResult); 3412 return vmlsq_f32( Angles, vResult, g_XMTwoPi ); 3413#elif defined(_XM_SSE_INTRINSICS_) 3414 // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI 3415 XMVECTOR vResult = _mm_mul_ps(Angles,g_XMReciprocalTwoPi); 3416 // Use the inline function due to complexity for rounding 3417 vResult = XMVectorRound(vResult); 3418 vResult = _mm_mul_ps(vResult,g_XMTwoPi); 3419 vResult = _mm_sub_ps(Angles,vResult); 3420 return vResult; 3421#else // _XM_VMX128_INTRINSICS_ 3422#endif // _XM_VMX128_INTRINSICS_ 3423} 3424 3425//------------------------------------------------------------------------------ 3426 3427inline XMVECTOR XMVectorSin 3428( 3429 FXMVECTOR V 3430) 3431{ 3432 // 11-degree minimax approximation 3433 3434#if defined(_XM_NO_INTRINSICS_) 3435 XMVECTOR Result; 3436 Result.vector4_f32[0] = XMScalarSin( V.vector4_f32[0] ); 3437 Result.vector4_f32[1] = XMScalarSin( V.vector4_f32[1] ); 3438 Result.vector4_f32[2] = XMScalarSin( V.vector4_f32[2] ); 3439 Result.vector4_f32[3] = XMScalarSin( V.vector4_f32[3] ); 3440 return Result; 3441#elif defined(_XM_ARM_NEON_INTRINSICS_) 3442 // Force the value within the bounds of pi 3443 XMVECTOR x = XMVectorModAngles(V); 3444 3445 // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
3446 __n128 sign = vandq_u32(x, g_XMNegativeZero); 3447 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3448 __n128 absx = vabsq_f32( x ); 3449 __n128 rflx = vsubq_f32(c, x); 3450 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 3451 x = vbslq_f32( comp, x, rflx ); 3452 3453 __n128 x2 = vmulq_f32(x, x); 3454 3455 // Compute polynomial approximation 3456 const XMVECTOR SC1 = g_XMSinCoefficients1; 3457 XMVECTOR Result = vdupq_lane_f32(vget_low_f32(SC1), 0); 3458 3459 const XMVECTOR SC0 = g_XMSinCoefficients0; 3460 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); 3461 Result = vmlaq_f32(vConstants, Result, x2); 3462 3463 vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); 3464 Result = vmlaq_f32(vConstants, Result, x2); 3465 3466 vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); 3467 Result = vmlaq_f32(vConstants, Result, x2); 3468 3469 vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); 3470 Result = vmlaq_f32(vConstants, Result, x2); 3471 3472 Result = vmlaq_f32(g_XMOne, Result, x2); 3473 Result = vmulq_f32(Result, x); 3474 return Result; 3475#elif defined(_XM_SSE_INTRINSICS_) 3476 // Force the value within the bounds of pi 3477 XMVECTOR x = XMVectorModAngles(V); 3478 3479 // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
3480 __m128 sign = _mm_and_ps(x, g_XMNegativeZero); 3481 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3482 __m128 absx = _mm_andnot_ps(sign, x); // |x| 3483 __m128 rflx = _mm_sub_ps(c, x); 3484 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 3485 __m128 select0 = _mm_and_ps(comp, x); 3486 __m128 select1 = _mm_andnot_ps(comp, rflx); 3487 x = _mm_or_ps(select0, select1); 3488 3489 __m128 x2 = _mm_mul_ps(x, x); 3490 3491 // Compute polynomial approximation 3492 const XMVECTOR SC1 = g_XMSinCoefficients1; 3493 XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); 3494 __m128 Result = _mm_mul_ps(vConstants, x2); 3495 3496 const XMVECTOR SC0 = g_XMSinCoefficients0; 3497 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); 3498 Result = _mm_add_ps(Result, vConstants); 3499 Result = _mm_mul_ps(Result, x2); 3500 3501 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); 3502 Result = _mm_add_ps(Result, vConstants); 3503 Result = _mm_mul_ps(Result, x2); 3504 3505 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); 3506 Result = _mm_add_ps(Result, vConstants); 3507 Result = _mm_mul_ps(Result, x2); 3508 3509 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); 3510 Result = _mm_add_ps(Result, vConstants); 3511 Result = _mm_mul_ps(Result, x2); 3512 Result = _mm_add_ps(Result, g_XMOne); 3513 Result = _mm_mul_ps(Result, x); 3514 return Result; 3515#else // _XM_VMX128_INTRINSICS_ 3516#endif // _XM_VMX128_INTRINSICS_ 3517} 3518 3519//------------------------------------------------------------------------------ 3520 3521inline XMVECTOR XMVectorCos 3522( 3523 FXMVECTOR V 3524) 3525{ 3526 // 10-degree minimax approximation 3527 3528#if defined(_XM_NO_INTRINSICS_) 3529 XMVECTOR Result; 3530 Result.vector4_f32[0] = XMScalarCos( V.vector4_f32[0] ); 3531 Result.vector4_f32[1] = XMScalarCos( V.vector4_f32[1] ); 3532 Result.vector4_f32[2] = XMScalarCos( V.vector4_f32[2] ); 3533 Result.vector4_f32[3] = XMScalarCos( 
V.vector4_f32[3] ); 3534 return Result; 3535#elif defined(_XM_ARM_NEON_INTRINSICS_) 3536 // Map V to x in [-pi,pi]. 3537 XMVECTOR x = XMVectorModAngles(V); 3538 3539 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 3540 __n128 sign = vandq_u32(x, g_XMNegativeZero); 3541 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3542 __n128 absx = vabsq_f32( x ); 3543 __n128 rflx = vsubq_f32(c, x); 3544 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 3545 x = vbslq_f32( comp, x, rflx ); 3546 sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); 3547 3548 __n128 x2 = vmulq_f32(x, x); 3549 3550 // Compute polynomial approximation 3551 const XMVECTOR CC1 = g_XMCosCoefficients1; 3552 XMVECTOR Result = vdupq_lane_f32(vget_low_f32(CC1), 0); 3553 3554 const XMVECTOR CC0 = g_XMCosCoefficients0; 3555 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); 3556 Result = vmlaq_f32(vConstants, Result, x2); 3557 3558 vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); 3559 Result = vmlaq_f32(vConstants, Result, x2); 3560 3561 vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); 3562 Result = vmlaq_f32(vConstants, Result, x2); 3563 3564 vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); 3565 Result = vmlaq_f32(vConstants, Result, x2); 3566 3567 Result = vmlaq_f32(g_XMOne, Result, x2); 3568 Result = vmulq_f32(Result, sign); 3569 return Result; 3570#elif defined(_XM_SSE_INTRINSICS_) 3571 // Map V to x in [-pi,pi]. 3572 XMVECTOR x = XMVectorModAngles(V); 3573 3574 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
3575 XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); 3576 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3577 __m128 absx = _mm_andnot_ps(sign, x); // |x| 3578 __m128 rflx = _mm_sub_ps(c, x); 3579 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 3580 __m128 select0 = _mm_and_ps(comp, x); 3581 __m128 select1 = _mm_andnot_ps(comp, rflx); 3582 x = _mm_or_ps(select0, select1); 3583 select0 = _mm_and_ps(comp, g_XMOne); 3584 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); 3585 sign = _mm_or_ps(select0, select1); 3586 3587 __m128 x2 = _mm_mul_ps(x, x); 3588 3589 // Compute polynomial approximation 3590 const XMVECTOR CC1 = g_XMCosCoefficients1; 3591 XMVECTOR vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); 3592 __m128 Result = _mm_mul_ps(vConstants, x2); 3593 3594 const XMVECTOR CC0 = g_XMCosCoefficients0; 3595 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); 3596 Result = _mm_add_ps(Result, vConstants); 3597 Result = _mm_mul_ps(Result, x2); 3598 3599 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); 3600 Result = _mm_add_ps(Result, vConstants); 3601 Result = _mm_mul_ps(Result, x2); 3602 3603 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); 3604 Result = _mm_add_ps(Result, vConstants); 3605 Result = _mm_mul_ps(Result, x2); 3606 3607 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); 3608 Result = _mm_add_ps(Result, vConstants); 3609 Result = _mm_mul_ps(Result, x2); 3610 Result = _mm_add_ps(Result, g_XMOne); 3611 Result = _mm_mul_ps(Result, sign); 3612 return Result; 3613#else // _XM_VMX128_INTRINSICS_ 3614#endif // _XM_VMX128_INTRINSICS_ 3615} 3616 3617//------------------------------------------------------------------------------ 3618 3619_Use_decl_annotations_ 3620inline void XMVectorSinCos 3621( 3622 XMVECTOR* pSin, 3623 XMVECTOR* pCos, 3624 FXMVECTOR V 3625) 3626{ 3627 assert(pSin != NULL); 3628 assert(pCos != NULL); 3629 3630 // 11/10-degree minimax approximation 3631 3632#if 
defined(_XM_NO_INTRINSICS_) 3633 XMVECTOR Sin; 3634 XMVECTOR Cos; 3635 3636 XMScalarSinCos(&Sin.vector4_f32[0], &Cos.vector4_f32[0], V.vector4_f32[0]); 3637 XMScalarSinCos(&Sin.vector4_f32[1], &Cos.vector4_f32[1], V.vector4_f32[1]); 3638 XMScalarSinCos(&Sin.vector4_f32[2], &Cos.vector4_f32[2], V.vector4_f32[2]); 3639 XMScalarSinCos(&Sin.vector4_f32[3], &Cos.vector4_f32[3], V.vector4_f32[3]); 3640 3641 *pSin = Sin; 3642 *pCos = Cos; 3643#elif defined(_XM_ARM_NEON_INTRINSICS_) 3644 // Force the value within the bounds of pi 3645 XMVECTOR x = XMVectorModAngles(V); 3646 3647 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 3648 __n128 sign = vandq_u32(x, g_XMNegativeZero); 3649 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3650 __n128 absx = vabsq_f32( x ); 3651 __n128 rflx = vsubq_f32(c, x); 3652 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 3653 x = vbslq_f32( comp, x, rflx ); 3654 sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); 3655 3656 __n128 x2 = vmulq_f32(x, x); 3657 3658 // Compute polynomial approximation for sine 3659 const XMVECTOR SC1 = g_XMSinCoefficients1; 3660 XMVECTOR Result = vdupq_lane_f32(vget_low_f32(SC1), 0); 3661 3662 const XMVECTOR SC0 = g_XMSinCoefficients0; 3663 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); 3664 Result = vmlaq_f32(vConstants, Result, x2); 3665 3666 vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); 3667 Result = vmlaq_f32(vConstants, Result, x2); 3668 3669 vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); 3670 Result = vmlaq_f32(vConstants, Result, x2); 3671 3672 vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); 3673 Result = vmlaq_f32(vConstants, Result, x2); 3674 3675 Result = vmlaq_f32(g_XMOne, Result, x2); 3676 *pSin = vmulq_f32(Result, x); 3677 3678 // Compute polynomial approximation for cosine 3679 const XMVECTOR CC1 = g_XMCosCoefficients1; 3680 Result = vdupq_lane_f32(vget_low_f32(CC1), 0); 3681 3682 const XMVECTOR CC0 = g_XMCosCoefficients0; 3683 vConstants = 
vdupq_lane_f32(vget_high_f32(CC0), 1); 3684 Result = vmlaq_f32(vConstants, Result, x2); 3685 3686 vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); 3687 Result = vmlaq_f32(vConstants, Result, x2); 3688 3689 vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); 3690 Result = vmlaq_f32(vConstants, Result, x2); 3691 3692 vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); 3693 Result = vmlaq_f32(vConstants, Result, x2); 3694 3695 Result = vmlaq_f32(g_XMOne, Result, x2); 3696 *pCos = vmulq_f32(Result, sign); 3697#elif defined(_XM_SSE_INTRINSICS_) 3698 // Force the value within the bounds of pi 3699 XMVECTOR x = XMVectorModAngles(V); 3700 3701 // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 3702 XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); 3703 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3704 __m128 absx = _mm_andnot_ps(sign, x); // |x| 3705 __m128 rflx = _mm_sub_ps(c, x); 3706 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 3707 __m128 select0 = _mm_and_ps(comp, x); 3708 __m128 select1 = _mm_andnot_ps(comp, rflx); 3709 x = _mm_or_ps(select0, select1); 3710 select0 = _mm_and_ps(comp, g_XMOne); 3711 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); 3712 sign = _mm_or_ps(select0, select1); 3713 3714 __m128 x2 = _mm_mul_ps(x, x); 3715 3716 // Compute polynomial approximation of sine 3717 const XMVECTOR SC1 = g_XMSinCoefficients1; 3718 XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); 3719 __m128 Result = _mm_mul_ps(vConstants, x2); 3720 3721 const XMVECTOR SC0 = g_XMSinCoefficients0; 3722 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); 3723 Result = _mm_add_ps(Result, vConstants); 3724 Result = _mm_mul_ps(Result, x2); 3725 3726 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); 3727 Result = _mm_add_ps(Result, vConstants); 3728 Result = _mm_mul_ps(Result, x2); 3729 3730 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); 3731 Result = _mm_add_ps(Result, vConstants); 3732 Result = 
_mm_mul_ps(Result, x2); 3733 3734 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); 3735 Result = _mm_add_ps(Result, vConstants); 3736 Result = _mm_mul_ps(Result, x2); 3737 Result = _mm_add_ps(Result, g_XMOne); 3738 Result = _mm_mul_ps(Result, x); 3739 *pSin = Result; 3740 3741 // Compute polynomial approximation of cosine 3742 const XMVECTOR CC1 = g_XMCosCoefficients1; 3743 vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); 3744 Result = _mm_mul_ps(vConstants, x2); 3745 3746 const XMVECTOR CC0 = g_XMCosCoefficients0; 3747 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); 3748 Result = _mm_add_ps(Result, vConstants); 3749 Result = _mm_mul_ps(Result, x2); 3750 3751 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); 3752 Result = _mm_add_ps(Result, vConstants); 3753 Result = _mm_mul_ps(Result, x2); 3754 3755 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); 3756 Result = _mm_add_ps(Result, vConstants); 3757 Result = _mm_mul_ps(Result, x2); 3758 3759 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); 3760 Result = _mm_add_ps(Result, vConstants); 3761 Result = _mm_mul_ps(Result, x2); 3762 Result = _mm_add_ps(Result, g_XMOne); 3763 Result = _mm_mul_ps(Result, sign); 3764 *pCos = Result; 3765#else // _XM_VMX128_INTRINSICS_ 3766#endif // _XM_VMX128_INTRINSICS_ 3767} 3768 3769//------------------------------------------------------------------------------ 3770 3771inline XMVECTOR XMVectorTan 3772( 3773 FXMVECTOR V 3774) 3775{ 3776 // Cody and Waite algorithm to compute tangent. 
3777 3778#if defined(_XM_NO_INTRINSICS_) 3779 XMVECTOR Result; 3780 Result.vector4_f32[0] = tanf( V.vector4_f32[0] ); 3781 Result.vector4_f32[1] = tanf( V.vector4_f32[1] ); 3782 Result.vector4_f32[2] = tanf( V.vector4_f32[2] ); 3783 Result.vector4_f32[3] = tanf( V.vector4_f32[3] ); 3784 return Result; 3785#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 3786 3787 static const XMVECTORF32 TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f}; 3788 static const XMVECTORF32 TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f}; 3789 static const XMVECTORF32 TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ }; 3790 static const XMVECTORU32 Mask = {0x1, 0x1, 0x1, 0x1}; 3791 3792 XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v); 3793 3794 XMVECTOR Zero = XMVectorZero(); 3795 3796 XMVECTOR C0 = XMVectorSplatX(TanConstants.v); 3797 XMVECTOR C1 = XMVectorSplatY(TanConstants.v); 3798 XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v); 3799 3800 XMVECTOR VA = XMVectorMultiply(V, TwoDivPi); 3801 3802 VA = XMVectorRound(VA); 3803 3804 XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V); 3805 3806 XMVECTOR VB = XMVectorAbs(VA); 3807 3808 VC = XMVectorNegativeMultiplySubtract(VA, C1, VC); 3809 3810#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 3811 VB = vcvtq_u32_f32( VB ); 3812#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 3813 reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB); 3814#else 3815 for (size_t i = 0; i < 4; i++) 3816 { 3817 VB.vector4_u32[i] = (uint32_t)VB.vector4_f32[i]; 3818 } 3819#endif 3820 3821 XMVECTOR VC2 = XMVectorMultiply(VC, VC); 3822 3823 XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v); 3824 XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v); 3825 XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v); 3826 XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v); 3827 XMVECTOR T5 = 
XMVectorSplatY(TanCoefficients1.v); 3828 XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v); 3829 XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v); 3830 XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v); 3831 3832 XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v); 3833 VBIsEven = XMVectorEqualInt(VBIsEven, Zero); 3834 3835 XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6); 3836 XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3); 3837 N = XMVectorMultiplyAdd(VC2, N, T5); 3838 D = XMVectorMultiplyAdd(VC2, D, T2); 3839 N = XMVectorMultiply(VC2, N); 3840 D = XMVectorMultiplyAdd(VC2, D, T1); 3841 N = XMVectorMultiplyAdd(VC, N, VC); 3842 XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon); 3843 D = XMVectorMultiplyAdd(VC2, D, T0); 3844 3845 N = XMVectorSelect(N, VC, VCNearZero); 3846 D = XMVectorSelect(D, g_XMOne.v, VCNearZero); 3847 3848 XMVECTOR R0 = XMVectorNegate(N); 3849 XMVECTOR R1 = XMVectorDivide(N,D); 3850 R0 = XMVectorDivide(D,R0); 3851 3852 XMVECTOR VIsZero = XMVectorEqual(V, Zero); 3853 3854 XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven); 3855 3856 Result = XMVectorSelect(Result, Zero, VIsZero); 3857 3858 return Result; 3859 3860#else // _XM_VMX128_INTRINSICS_ 3861#endif // _XM_VMX128_INTRINSICS_ 3862} 3863 3864//------------------------------------------------------------------------------ 3865 3866inline XMVECTOR XMVectorSinH 3867( 3868 FXMVECTOR V 3869) 3870{ 3871#if defined(_XM_NO_INTRINSICS_) 3872 XMVECTOR Result; 3873 Result.vector4_f32[0] = sinhf( V.vector4_f32[0] ); 3874 Result.vector4_f32[1] = sinhf( V.vector4_f32[1] ); 3875 Result.vector4_f32[2] = sinhf( V.vector4_f32[2] ); 3876 Result.vector4_f32[3] = sinhf( V.vector4_f32[3] ); 3877 return Result; 3878#elif defined(_XM_ARM_NEON_INTRINSICS_) 3879 static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) 3880 3881 XMVECTOR V1 = vmlaq_f32( g_XMNegativeOne.v, V, Scale.v ); 3882 XMVECTOR V2 = vmlsq_f32( g_XMNegativeOne.v, V, Scale.v ); 
3883 XMVECTOR E1 = XMVectorExp(V1); 3884 XMVECTOR E2 = XMVectorExp(V2); 3885 3886 return vsubq_f32(E1, E2); 3887#elif defined(_XM_SSE_INTRINSICS_) 3888 static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) 3889 3890 XMVECTOR V1 = _mm_mul_ps(V, Scale); 3891 V1 = _mm_add_ps(V1,g_XMNegativeOne); 3892 XMVECTOR V2 = _mm_mul_ps(V, Scale); 3893 V2 = _mm_sub_ps(g_XMNegativeOne,V2); 3894 XMVECTOR E1 = XMVectorExp(V1); 3895 XMVECTOR E2 = XMVectorExp(V2); 3896 3897 return _mm_sub_ps(E1, E2); 3898#else // _XM_VMX128_INTRINSICS_ 3899#endif // _XM_VMX128_INTRINSICS_ 3900} 3901 3902//------------------------------------------------------------------------------ 3903 3904inline XMVECTOR XMVectorCosH 3905( 3906 FXMVECTOR V 3907) 3908{ 3909#if defined(_XM_NO_INTRINSICS_) 3910 XMVECTOR Result; 3911 Result.vector4_f32[0] = coshf( V.vector4_f32[0] ); 3912 Result.vector4_f32[1] = coshf( V.vector4_f32[1] ); 3913 Result.vector4_f32[2] = coshf( V.vector4_f32[2] ); 3914 Result.vector4_f32[3] = coshf( V.vector4_f32[3] ); 3915 return Result; 3916#elif defined(_XM_ARM_NEON_INTRINSICS_) 3917 static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) 3918 3919 XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); 3920 XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); 3921 XMVECTOR E1 = XMVectorExp(V1); 3922 XMVECTOR E2 = XMVectorExp(V2); 3923 return vaddq_f32(E1, E2); 3924#elif defined(_XM_SSE_INTRINSICS_) 3925 static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) 3926 3927 XMVECTOR V1 = _mm_mul_ps(V,Scale.v); 3928 V1 = _mm_add_ps(V1,g_XMNegativeOne.v); 3929 XMVECTOR V2 = _mm_mul_ps(V, Scale.v); 3930 V2 = _mm_sub_ps(g_XMNegativeOne.v,V2); 3931 XMVECTOR E1 = XMVectorExp(V1); 3932 XMVECTOR E2 = XMVectorExp(V2); 3933 return _mm_add_ps(E1, E2); 3934#else // 
_XM_VMX128_INTRINSICS_ 3935#endif // _XM_VMX128_INTRINSICS_ 3936} 3937 3938//------------------------------------------------------------------------------ 3939 3940inline XMVECTOR XMVectorTanH 3941( 3942 FXMVECTOR V 3943) 3944{ 3945#if defined(_XM_NO_INTRINSICS_) 3946 XMVECTOR Result; 3947 Result.vector4_f32[0] = tanhf( V.vector4_f32[0] ); 3948 Result.vector4_f32[1] = tanhf( V.vector4_f32[1] ); 3949 Result.vector4_f32[2] = tanhf( V.vector4_f32[2] ); 3950 Result.vector4_f32[3] = tanhf( V.vector4_f32[3] ); 3951 return Result; 3952#elif defined(_XM_ARM_NEON_INTRINSICS_) 3953 static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) 3954 3955 XMVECTOR E = vmulq_f32(V, Scale.v); 3956 E = XMVectorExp(E); 3957 E = vmlaq_f32( g_XMOneHalf.v, E, g_XMOneHalf.v ); 3958 E = XMVectorReciprocal(E); 3959 return vsubq_f32(g_XMOne.v, E); 3960#elif defined(_XM_SSE_INTRINSICS_) 3961 static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) 3962 3963 XMVECTOR E = _mm_mul_ps(V, Scale.v); 3964 E = XMVectorExp(E); 3965 E = _mm_mul_ps(E,g_XMOneHalf.v); 3966 E = _mm_add_ps(E,g_XMOneHalf.v); 3967 E = _mm_div_ps(g_XMOne.v,E); 3968 return _mm_sub_ps(g_XMOne.v,E); 3969#else // _XM_VMX128_INTRINSICS_ 3970#endif // _XM_VMX128_INTRINSICS_ 3971} 3972 3973//------------------------------------------------------------------------------ 3974 3975inline XMVECTOR XMVectorASin 3976( 3977 FXMVECTOR V 3978) 3979{ 3980 // 7-degree minimax approximation 3981 3982#if defined(_XM_NO_INTRINSICS_) 3983 XMVECTOR Result; 3984 Result.vector4_f32[0] = XMScalarASin( V.vector4_f32[0] ); 3985 Result.vector4_f32[1] = XMScalarASin( V.vector4_f32[1] ); 3986 Result.vector4_f32[2] = XMScalarASin( V.vector4_f32[2] ); 3987 Result.vector4_f32[3] = XMScalarASin( V.vector4_f32[3] ); 3988 return Result; 3989#elif defined(_XM_ARM_NEON_INTRINSICS_) 3990 __n128 nonnegative = 
vcgeq_f32(V, g_XMZero); 3991 __n128 x = vabsq_f32(V); 3992 3993 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 3994 __n128 oneMValue = vsubq_f32(g_XMOne, x); 3995 __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); 3996 __n128 root = XMVectorSqrt(clampOneMValue); 3997 3998 // Compute polynomial approximation 3999 const XMVECTOR AC1 = g_XMArcCoefficients1; 4000 __n128 t0 = vdupq_lane_f32(vget_high_f32(AC1), 1); 4001 4002 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); 4003 t0 = vmlaq_f32( vConstants, t0, x ); 4004 4005 vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); 4006 t0 = vmlaq_f32( vConstants, t0, x ); 4007 4008 vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); 4009 t0 = vmlaq_f32( vConstants, t0, x ); 4010 4011 const XMVECTOR AC0 = g_XMArcCoefficients0; 4012 vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); 4013 t0 = vmlaq_f32( vConstants, t0, x ); 4014 4015 vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); 4016 t0 = vmlaq_f32( vConstants, t0, x ); 4017 4018 vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); 4019 t0 = vmlaq_f32( vConstants, t0, x ); 4020 4021 vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); 4022 t0 = vmlaq_f32( vConstants, t0, x ); 4023 t0 = vmulq_f32(t0, root); 4024 4025 __n128 t1 = vsubq_f32(g_XMPi, t0); 4026 t0 = vbslq_f32( nonnegative, t0, t1 ); 4027 t0 = vsubq_f32(g_XMHalfPi, t0); 4028 return t0; 4029#elif defined(_XM_SSE_INTRINSICS_) 4030 __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); 4031 __m128 mvalue = _mm_sub_ps(g_XMZero, V); 4032 __m128 x = _mm_max_ps(V, mvalue); // |V| 4033 4034 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
4035 __m128 oneMValue = _mm_sub_ps(g_XMOne, x); 4036 __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); 4037 __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) 4038 4039 // Compute polynomial approximation 4040 const XMVECTOR AC1 = g_XMArcCoefficients1; 4041 XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); 4042 __m128 t0 = _mm_mul_ps(vConstants, x); 4043 4044 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); 4045 t0 = _mm_add_ps(t0, vConstants); 4046 t0 = _mm_mul_ps(t0, x); 4047 4048 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); 4049 t0 = _mm_add_ps(t0, vConstants); 4050 t0 = _mm_mul_ps(t0, x); 4051 4052 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); 4053 t0 = _mm_add_ps(t0, vConstants); 4054 t0 = _mm_mul_ps(t0, x); 4055 4056 const XMVECTOR AC0 = g_XMArcCoefficients0; 4057 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); 4058 t0 = _mm_add_ps(t0, vConstants); 4059 t0 = _mm_mul_ps(t0, x); 4060 4061 vConstants = XM_PERMUTE_PS( AC0,_MM_SHUFFLE(2, 2, 2, 2) ); 4062 t0 = _mm_add_ps(t0, vConstants); 4063 t0 = _mm_mul_ps(t0, x); 4064 4065 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); 4066 t0 = _mm_add_ps(t0, vConstants); 4067 t0 = _mm_mul_ps(t0, x); 4068 4069 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); 4070 t0 = _mm_add_ps(t0, vConstants); 4071 t0 = _mm_mul_ps(t0, root); 4072 4073 __m128 t1 = _mm_sub_ps(g_XMPi, t0); 4074 t0 = _mm_and_ps(nonnegative, t0); 4075 t1 = _mm_andnot_ps(nonnegative, t1); 4076 t0 = _mm_or_ps(t0, t1); 4077 t0 = _mm_sub_ps(g_XMHalfPi, t0); 4078 return t0; 4079#else // _XM_VMX128_INTRINSICS_ 4080#endif // _XM_VMX128_INTRINSICS_ 4081} 4082 4083//------------------------------------------------------------------------------ 4084 4085inline XMVECTOR XMVectorACos 4086( 4087 FXMVECTOR V 4088) 4089{ 4090 // 7-degree minimax approximation 4091 4092#if defined(_XM_NO_INTRINSICS_) 4093 XMVECTOR Result; 4094 Result.vector4_f32[0] = XMScalarACos( 
V.vector4_f32[0] ); 4095 Result.vector4_f32[1] = XMScalarACos( V.vector4_f32[1] ); 4096 Result.vector4_f32[2] = XMScalarACos( V.vector4_f32[2] ); 4097 Result.vector4_f32[3] = XMScalarACos( V.vector4_f32[3] ); 4098 return Result; 4099#elif defined(_XM_ARM_NEON_INTRINSICS_) 4100 __n128 nonnegative = vcgeq_f32(V, g_XMZero); 4101 __n128 x = vabsq_f32(V); 4102 4103 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 4104 __n128 oneMValue = vsubq_f32(g_XMOne, x); 4105 __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); 4106 __n128 root = XMVectorSqrt(clampOneMValue); 4107 4108 // Compute polynomial approximation 4109 const XMVECTOR AC1 = g_XMArcCoefficients1; 4110 __n128 t0 = vdupq_lane_f32(vget_high_f32(AC1), 1); 4111 4112 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); 4113 t0 = vmlaq_f32( vConstants, t0, x ); 4114 4115 vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); 4116 t0 = vmlaq_f32( vConstants, t0, x ); 4117 4118 vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); 4119 t0 = vmlaq_f32( vConstants, t0, x ); 4120 4121 const XMVECTOR AC0 = g_XMArcCoefficients0; 4122 vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); 4123 t0 = vmlaq_f32( vConstants, t0, x ); 4124 4125 vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); 4126 t0 = vmlaq_f32( vConstants, t0, x ); 4127 4128 vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); 4129 t0 = vmlaq_f32( vConstants, t0, x ); 4130 4131 vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); 4132 t0 = vmlaq_f32( vConstants, t0, x ); 4133 t0 = vmulq_f32(t0, root); 4134 4135 __n128 t1 = vsubq_f32(g_XMPi, t0); 4136 t0 = vbslq_f32( nonnegative, t0, t1 ); 4137 return t0; 4138#elif defined(_XM_SSE_INTRINSICS_) 4139 __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); 4140 __m128 mvalue = _mm_sub_ps(g_XMZero, V); 4141 __m128 x = _mm_max_ps(V, mvalue); // |V| 4142 4143 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
4144 __m128 oneMValue = _mm_sub_ps(g_XMOne, x); 4145 __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); 4146 __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) 4147 4148 // Compute polynomial approximation 4149 const XMVECTOR AC1 = g_XMArcCoefficients1; 4150 XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); 4151 __m128 t0 = _mm_mul_ps(vConstants, x); 4152 4153 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); 4154 t0 = _mm_add_ps(t0, vConstants); 4155 t0 = _mm_mul_ps(t0, x); 4156 4157 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); 4158 t0 = _mm_add_ps(t0, vConstants); 4159 t0 = _mm_mul_ps(t0, x); 4160 4161 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); 4162 t0 = _mm_add_ps(t0, vConstants); 4163 t0 = _mm_mul_ps(t0, x); 4164 4165 const XMVECTOR AC0 = g_XMArcCoefficients0; 4166 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); 4167 t0 = _mm_add_ps(t0, vConstants); 4168 t0 = _mm_mul_ps(t0, x); 4169 4170 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(2, 2, 2, 2) ); 4171 t0 = _mm_add_ps(t0, vConstants); 4172 t0 = _mm_mul_ps(t0, x); 4173 4174 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); 4175 t0 = _mm_add_ps(t0, vConstants); 4176 t0 = _mm_mul_ps(t0, x); 4177 4178 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); 4179 t0 = _mm_add_ps(t0, vConstants); 4180 t0 = _mm_mul_ps(t0, root); 4181 4182 __m128 t1 = _mm_sub_ps(g_XMPi, t0); 4183 t0 = _mm_and_ps(nonnegative, t0); 4184 t1 = _mm_andnot_ps(nonnegative, t1); 4185 t0 = _mm_or_ps(t0, t1); 4186 return t0; 4187#else // _XM_VMX128_INTRINSICS_ 4188#endif // _XM_VMX128_INTRINSICS_ 4189} 4190 4191//------------------------------------------------------------------------------ 4192 4193inline XMVECTOR XMVectorATan 4194( 4195 FXMVECTOR V 4196) 4197{ 4198 // 17-degree minimax approximation 4199 4200#if defined(_XM_NO_INTRINSICS_) 4201 XMVECTOR Result; 4202 Result.vector4_f32[0] = atanf( V.vector4_f32[0] ); 4203 Result.vector4_f32[1] = 
atanf( V.vector4_f32[1] ); 4204 Result.vector4_f32[2] = atanf( V.vector4_f32[2] ); 4205 Result.vector4_f32[3] = atanf( V.vector4_f32[3] ); 4206 return Result; 4207#elif defined(_XM_ARM_NEON_INTRINSICS_) 4208 __n128 absV = vabsq_f32(V); 4209 __n128 invV = XMVectorReciprocal(V); 4210 __n128 comp = vcgtq_f32(V, g_XMOne); 4211 __n128 sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); 4212 comp = vcleq_f32(absV, g_XMOne); 4213 sign = vbslq_f32(comp, g_XMZero, sign); 4214 __n128 x = vbslq_f32(comp, V, invV); 4215 4216 __n128 x2 = vmulq_f32(x, x); 4217 4218 // Compute polynomial approximation 4219 const XMVECTOR TC1 = g_XMATanCoefficients1; 4220 __n128 Result = vdupq_lane_f32(vget_high_f32(TC1), 1); 4221 4222 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); 4223 Result = vmlaq_f32( vConstants, Result, x2 ); 4224 4225 vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); 4226 Result = vmlaq_f32( vConstants, Result, x2 ); 4227 4228 vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); 4229 Result = vmlaq_f32( vConstants, Result, x2 ); 4230 4231 const XMVECTOR TC0 = g_XMATanCoefficients0; 4232 vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); 4233 Result = vmlaq_f32( vConstants, Result, x2 ); 4234 4235 vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); 4236 Result = vmlaq_f32( vConstants, Result, x2 ); 4237 4238 vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); 4239 Result = vmlaq_f32( vConstants, Result, x2 ); 4240 4241 vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); 4242 Result = vmlaq_f32( vConstants, Result, x2 ); 4243 4244 Result = vmlaq_f32( g_XMOne, Result, x2 ); 4245 Result = vmulq_f32( Result, x ); 4246 4247 __n128 result1 = vmulq_f32(sign, g_XMHalfPi); 4248 result1 = vsubq_f32(result1, Result); 4249 4250 comp = vceqq_f32(sign, g_XMZero); 4251 Result = vbslq_f32( comp, Result, result1 ); 4252 return Result; 4253#elif defined(_XM_SSE_INTRINSICS_) 4254 __m128 absV = XMVectorAbs(V); 4255 __m128 invV = _mm_div_ps(g_XMOne, V); 4256 __m128 comp = 
_mm_cmpgt_ps(V, g_XMOne); 4257 __m128 select0 = _mm_and_ps(comp, g_XMOne); 4258 __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); 4259 __m128 sign = _mm_or_ps(select0, select1); 4260 comp = _mm_cmple_ps(absV, g_XMOne); 4261 select0 = _mm_and_ps(comp, g_XMZero); 4262 select1 = _mm_andnot_ps(comp, sign); 4263 sign = _mm_or_ps(select0, select1); 4264 select0 = _mm_and_ps(comp, V); 4265 select1 = _mm_andnot_ps(comp, invV); 4266 __m128 x = _mm_or_ps(select0, select1); 4267 4268 __m128 x2 = _mm_mul_ps(x, x); 4269 4270 // Compute polynomial approximation 4271 const XMVECTOR TC1 = g_XMATanCoefficients1; 4272 XMVECTOR vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(3, 3, 3, 3) ); 4273 __m128 Result = _mm_mul_ps(vConstants, x2); 4274 4275 vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(2, 2, 2, 2) ); 4276 Result = _mm_add_ps(Result, vConstants); 4277 Result = _mm_mul_ps(Result, x2); 4278 4279 vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(1, 1, 1, 1) ); 4280 Result = _mm_add_ps(Result, vConstants); 4281 Result = _mm_mul_ps(Result, x2); 4282 4283 vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(0, 0, 0, 0) ); 4284 Result = _mm_add_ps(Result, vConstants); 4285 Result = _mm_mul_ps(Result, x2); 4286 4287 const XMVECTOR TC0 = g_XMATanCoefficients0; 4288 vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(3, 3, 3, 3) ); 4289 Result = _mm_add_ps(Result, vConstants); 4290 Result = _mm_mul_ps(Result, x2); 4291 4292 vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(2, 2, 2, 2) ); 4293 Result = _mm_add_ps(Result, vConstants); 4294 Result = _mm_mul_ps(Result, x2); 4295 4296 vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(1, 1, 1, 1) ); 4297 Result = _mm_add_ps(Result, vConstants); 4298 Result = _mm_mul_ps(Result, x2); 4299 4300 vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(0, 0, 0, 0) ); 4301 Result = _mm_add_ps(Result, vConstants); 4302 Result = _mm_mul_ps(Result, x2); 4303 Result = _mm_add_ps(Result, g_XMOne); 4304 Result = _mm_mul_ps(Result, x); 4305 __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); 4306 
result1 = _mm_sub_ps(result1, Result); 4307 4308 comp = _mm_cmpeq_ps(sign, g_XMZero); 4309 select0 = _mm_and_ps(comp, Result); 4310 select1 = _mm_andnot_ps(comp, result1); 4311 Result = _mm_or_ps(select0, select1); 4312 return Result; 4313#else // _XM_VMX128_INTRINSICS_ 4314#endif // _XM_VMX128_INTRINSICS_ 4315} 4316 4317//------------------------------------------------------------------------------ 4318 4319inline XMVECTOR XMVectorATan2 4320( 4321 FXMVECTOR Y, 4322 FXMVECTOR X 4323) 4324{ 4325 // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: 4326 4327 // Y == 0 and X is Negative -> Pi with the sign of Y 4328 // y == 0 and x is positive -> 0 with the sign of y 4329 // Y != 0 and X == 0 -> Pi / 2 with the sign of Y 4330 // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) 4331 // X == -Infinity and Finite Y -> Pi with the sign of Y 4332 // X == +Infinity and Finite Y -> 0 with the sign of Y 4333 // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y 4334 // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y 4335 // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y 4336 4337 static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; 4338 4339 XMVECTOR Zero = XMVectorZero(); 4340 XMVECTOR ATanResultValid = XMVectorTrueInt(); 4341 4342 XMVECTOR Pi = XMVectorSplatX(ATan2Constants); 4343 XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); 4344 XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); 4345 XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); 4346 4347 XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); 4348 XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); 4349 XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); 4350 XIsPositive = XMVectorEqualInt(XIsPositive, Zero); 4351 XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); 4352 XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); 4353 4354 XMVECTOR YSign = XMVectorAndInt(Y, 
g_XMNegativeZero.v); 4355 Pi = XMVectorOrInt(Pi, YSign); 4356 PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); 4357 PiOverFour = XMVectorOrInt(PiOverFour, YSign); 4358 ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); 4359 4360 XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); 4361 XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); 4362 XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); 4363 XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); 4364 XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); 4365 XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); 4366 ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); 4367 4368 XMVECTOR V = XMVectorDivide(Y, X); 4369 4370 XMVECTOR R0 = XMVectorATan(V); 4371 4372 R1 = XMVectorSelect( Pi, Zero, XIsPositive ); 4373 R2 = XMVectorAdd(R0, R1); 4374 4375 return XMVectorSelect(Result, R2, ATanResultValid); 4376} 4377 4378//------------------------------------------------------------------------------ 4379 4380inline XMVECTOR XMVectorSinEst 4381( 4382 FXMVECTOR V 4383) 4384{ 4385 // 7-degree minimax approximation 4386 4387#if defined(_XM_NO_INTRINSICS_) 4388 XMVECTOR Result; 4389 Result.vector4_f32[0] = XMScalarSinEst( V.vector4_f32[0] ); 4390 Result.vector4_f32[1] = XMScalarSinEst( V.vector4_f32[1] ); 4391 Result.vector4_f32[2] = XMScalarSinEst( V.vector4_f32[2] ); 4392 Result.vector4_f32[3] = XMScalarSinEst( V.vector4_f32[3] ); 4393 return Result; 4394#elif defined(_XM_ARM_NEON_INTRINSICS_) 4395 // Force the value within the bounds of pi 4396 XMVECTOR x = XMVectorModAngles(V); 4397 4398 // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
4399 __n128 sign = vandq_u32(x, g_XMNegativeZero); 4400 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4401 __n128 absx = vabsq_f32( x ); 4402 __n128 rflx = vsubq_f32(c, x); 4403 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 4404 x = vbslq_f32( comp, x, rflx ); 4405 4406 __n128 x2 = vmulq_f32(x, x); 4407 4408 // Compute polynomial approximation 4409 const XMVECTOR SEC = g_XMSinCoefficients1; 4410 XMVECTOR Result = vdupq_lane_f32(vget_high_f32(SEC), 1); 4411 4412 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); 4413 Result = vmlaq_f32(vConstants, Result, x2); 4414 4415 vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); 4416 Result = vmlaq_f32(vConstants, Result, x2); 4417 4418 Result = vmlaq_f32(g_XMOne, Result, x2); 4419 Result = vmulq_f32(Result, x); 4420 return Result; 4421#elif defined(_XM_SSE_INTRINSICS_) 4422 // Force the value within the bounds of pi 4423 XMVECTOR x = XMVectorModAngles(V); 4424 4425 // Map in [-pi/2,pi/2] with sin(y) = sin(x). 4426 __m128 sign = _mm_and_ps(x, g_XMNegativeZero); 4427 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4428 __m128 absx = _mm_andnot_ps(sign, x); // |x| 4429 __m128 rflx = _mm_sub_ps(c, x); 4430 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 4431 __m128 select0 = _mm_and_ps(comp, x); 4432 __m128 select1 = _mm_andnot_ps(comp, rflx); 4433 x = _mm_or_ps(select0, select1); 4434 4435 __m128 x2 = _mm_mul_ps(x, x); 4436 4437 // Compute polynomial approximation 4438 const XMVECTOR SEC = g_XMSinCoefficients1; 4439 XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); 4440 __m128 Result = _mm_mul_ps(vConstants, x2); 4441 4442 vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); 4443 Result = _mm_add_ps(Result, vConstants); 4444 Result = _mm_mul_ps(Result, x2); 4445 4446 vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); 4447 Result = _mm_add_ps(Result, vConstants); 4448 Result = _mm_mul_ps(Result, x2); 4449 4450 Result = _mm_add_ps(Result, 
g_XMOne); 4451 Result = _mm_mul_ps(Result, x); 4452 return Result; 4453#else // _XM_VMX128_INTRINSICS_ 4454#endif // _XM_VMX128_INTRINSICS_ 4455} 4456 4457//------------------------------------------------------------------------------ 4458 4459inline XMVECTOR XMVectorCosEst 4460( 4461 FXMVECTOR V 4462) 4463{ 4464 // 6-degree minimax approximation 4465 4466#if defined(_XM_NO_INTRINSICS_) 4467 XMVECTOR Result; 4468 Result.vector4_f32[0] = XMScalarCosEst( V.vector4_f32[0] ); 4469 Result.vector4_f32[1] = XMScalarCosEst( V.vector4_f32[1] ); 4470 Result.vector4_f32[2] = XMScalarCosEst( V.vector4_f32[2] ); 4471 Result.vector4_f32[3] = XMScalarCosEst( V.vector4_f32[3] ); 4472 return Result; 4473#elif defined(_XM_ARM_NEON_INTRINSICS_) 4474 // Map V to x in [-pi,pi]. 4475 XMVECTOR x = XMVectorModAngles(V); 4476 4477 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 4478 __n128 sign = vandq_u32(x, g_XMNegativeZero); 4479 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4480 __n128 absx = vabsq_f32( x ); 4481 __n128 rflx = vsubq_f32(c, x); 4482 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 4483 x = vbslq_f32( comp, x, rflx ); 4484 sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); 4485 4486 __n128 x2 = vmulq_f32(x, x); 4487 4488 // Compute polynomial approximation 4489 const XMVECTOR CEC = g_XMCosCoefficients1; 4490 XMVECTOR Result = vdupq_lane_f32(vget_high_f32(CEC), 1); 4491 4492 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); 4493 Result = vmlaq_f32(vConstants, Result, x2); 4494 4495 vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); 4496 Result = vmlaq_f32(vConstants, Result, x2); 4497 4498 Result = vmlaq_f32(g_XMOne, Result, x2); 4499 Result = vmulq_f32(Result, sign); 4500 return Result; 4501#elif defined(_XM_SSE_INTRINSICS_) 4502 // Map V to x in [-pi,pi]. 4503 XMVECTOR x = XMVectorModAngles(V); 4504 4505 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
4506 XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); 4507 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4508 __m128 absx = _mm_andnot_ps(sign, x); // |x| 4509 __m128 rflx = _mm_sub_ps(c, x); 4510 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 4511 __m128 select0 = _mm_and_ps(comp, x); 4512 __m128 select1 = _mm_andnot_ps(comp, rflx); 4513 x = _mm_or_ps(select0, select1); 4514 select0 = _mm_and_ps(comp, g_XMOne); 4515 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); 4516 sign = _mm_or_ps(select0, select1); 4517 4518 __m128 x2 = _mm_mul_ps(x, x); 4519 4520 // Compute polynomial approximation 4521 const XMVECTOR CEC = g_XMCosCoefficients1; 4522 XMVECTOR vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); 4523 __m128 Result = _mm_mul_ps(vConstants, x2); 4524 4525 vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); 4526 Result = _mm_add_ps(Result, vConstants); 4527 Result = _mm_mul_ps(Result, x2); 4528 4529 vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); 4530 Result = _mm_add_ps(Result, vConstants); 4531 Result = _mm_mul_ps(Result, x2); 4532 4533 Result = _mm_add_ps(Result, g_XMOne); 4534 Result = _mm_mul_ps(Result, sign); 4535 return Result; 4536#else // _XM_VMX128_INTRINSICS_ 4537#endif // _XM_VMX128_INTRINSICS_ 4538} 4539 4540//------------------------------------------------------------------------------ 4541 4542_Use_decl_annotations_ 4543inline void XMVectorSinCosEst 4544( 4545 XMVECTOR* pSin, 4546 XMVECTOR* pCos, 4547 FXMVECTOR V 4548) 4549{ 4550 assert(pSin != NULL); 4551 assert(pCos != NULL); 4552 4553 // 7/6-degree minimax approximation 4554 4555#if defined(_XM_NO_INTRINSICS_) 4556 XMVECTOR Sin; 4557 XMVECTOR Cos; 4558 4559 XMScalarSinCosEst(&Sin.vector4_f32[0], &Cos.vector4_f32[0], V.vector4_f32[0]); 4560 XMScalarSinCosEst(&Sin.vector4_f32[1], &Cos.vector4_f32[1], V.vector4_f32[1]); 4561 XMScalarSinCosEst(&Sin.vector4_f32[2], &Cos.vector4_f32[2], V.vector4_f32[2]); 4562 XMScalarSinCosEst(&Sin.vector4_f32[3], 
&Cos.vector4_f32[3], V.vector4_f32[3]); 4563 4564 *pSin = Sin; 4565 *pCos = Cos; 4566#elif defined(_XM_ARM_NEON_INTRINSICS_) 4567 // Force the value within the bounds of pi 4568 XMVECTOR x = XMVectorModAngles(V); 4569 4570 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 4571 __n128 sign = vandq_u32(x, g_XMNegativeZero); 4572 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4573 __n128 absx = vabsq_f32( x ); 4574 __n128 rflx = vsubq_f32(c, x); 4575 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 4576 x = vbslq_f32( comp, x, rflx ); 4577 sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); 4578 4579 __n128 x2 = vmulq_f32(x, x); 4580 4581 // Compute polynomial approximation for sine 4582 const XMVECTOR SEC = g_XMSinCoefficients1; 4583 XMVECTOR Result = vdupq_lane_f32(vget_high_f32(SEC), 1); 4584 4585 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); 4586 Result = vmlaq_f32(vConstants, Result, x2); 4587 4588 vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); 4589 Result = vmlaq_f32(vConstants, Result, x2); 4590 4591 Result = vmlaq_f32(g_XMOne, Result, x2); 4592 *pSin = vmulq_f32(Result, x); 4593 4594 // Compute polynomial approximation 4595 const XMVECTOR CEC = g_XMCosCoefficients1; 4596 Result = vdupq_lane_f32(vget_high_f32(CEC), 1); 4597 4598 vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); 4599 Result = vmlaq_f32(vConstants, Result, x2); 4600 4601 vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); 4602 Result = vmlaq_f32(vConstants, Result, x2); 4603 4604 Result = vmlaq_f32(g_XMOne, Result, x2); 4605 *pCos = vmulq_f32(Result, sign); 4606#elif defined(_XM_SSE_INTRINSICS_) 4607 // Force the value within the bounds of pi 4608 XMVECTOR x = XMVectorModAngles(V); 4609 4610 // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
4611 XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); 4612 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4613 __m128 absx = _mm_andnot_ps(sign, x); // |x| 4614 __m128 rflx = _mm_sub_ps(c, x); 4615 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 4616 __m128 select0 = _mm_and_ps(comp, x); 4617 __m128 select1 = _mm_andnot_ps(comp, rflx); 4618 x = _mm_or_ps(select0, select1); 4619 select0 = _mm_and_ps(comp, g_XMOne); 4620 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); 4621 sign = _mm_or_ps(select0, select1); 4622 4623 __m128 x2 = _mm_mul_ps(x, x); 4624 4625 // Compute polynomial approximation for sine 4626 const XMVECTOR SEC = g_XMSinCoefficients1; 4627 XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); 4628 __m128 Result = _mm_mul_ps(vConstants, x2); 4629 4630 vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); 4631 Result = _mm_add_ps(Result, vConstants); 4632 Result = _mm_mul_ps(Result, x2); 4633 4634 vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); 4635 Result = _mm_add_ps(Result, vConstants); 4636 Result = _mm_mul_ps(Result, x2); 4637 4638 Result = _mm_add_ps(Result, g_XMOne); 4639 Result = _mm_mul_ps(Result, x); 4640 *pSin = Result; 4641 4642 // Compute polynomial approximation for cosine 4643 const XMVECTOR CEC = g_XMCosCoefficients1; 4644 vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); 4645 Result = _mm_mul_ps(vConstants, x2); 4646 4647 vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); 4648 Result = _mm_add_ps(Result, vConstants); 4649 Result = _mm_mul_ps(Result, x2); 4650 4651 vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); 4652 Result = _mm_add_ps(Result, vConstants); 4653 Result = _mm_mul_ps(Result, x2); 4654 4655 Result = _mm_add_ps(Result, g_XMOne); 4656 Result = _mm_mul_ps(Result, sign); 4657 *pCos = Result; 4658#else // _XM_VMX128_INTRINSICS_ 4659#endif // _XM_VMX128_INTRINSICS_ 4660} 4661 
4662//------------------------------------------------------------------------------ 4663 4664inline XMVECTOR XMVectorTanEst 4665( 4666 FXMVECTOR V 4667) 4668{ 4669 XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); 4670 4671 XMVECTOR V1 = XMVectorMultiply(V, OneOverPi); 4672 V1 = XMVectorRound(V1); 4673 4674 V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); 4675 4676 XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); 4677 XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); 4678 XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); 4679 4680 XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); 4681 XMVECTOR V2 = XMVectorMultiply(V1, V1); 4682 XMVECTOR V1T0 = XMVectorMultiply(V1, T0); 4683 XMVECTOR V1T1 = XMVectorMultiply(V1, T1); 4684 4685 XMVECTOR D = XMVectorReciprocalEst(V2T2); 4686 XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); 4687 4688 return XMVectorMultiply(N, D); 4689} 4690 4691 4692//------------------------------------------------------------------------------ 4693 4694inline XMVECTOR XMVectorASinEst 4695( 4696 FXMVECTOR V 4697) 4698{ 4699 // 3-degree minimax approximation 4700 4701#if defined(_XM_NO_INTRINSICS_) 4702 XMVECTOR Result; 4703 Result.vector4_f32[0] = XMScalarASinEst( V.vector4_f32[0] ); 4704 Result.vector4_f32[1] = XMScalarASinEst( V.vector4_f32[1] ); 4705 Result.vector4_f32[2] = XMScalarASinEst( V.vector4_f32[2] ); 4706 Result.vector4_f32[3] = XMScalarASinEst( V.vector4_f32[3] ); 4707 return Result; 4708#elif defined(_XM_ARM_NEON_INTRINSICS_) 4709 __n128 nonnegative = vcgeq_f32(V, g_XMZero); 4710 __n128 x = vabsq_f32(V); 4711 4712 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
4713 __n128 oneMValue = vsubq_f32(g_XMOne, x); 4714 __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); 4715 __n128 root = XMVectorSqrt(clampOneMValue); 4716 4717 // Compute polynomial approximation 4718 const XMVECTOR AEC = g_XMArcEstCoefficients; 4719 __n128 t0 = vdupq_lane_f32(vget_high_f32(AEC), 1); 4720 4721 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); 4722 t0 = vmlaq_f32( vConstants, t0, x ); 4723 4724 vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); 4725 t0 = vmlaq_f32( vConstants, t0, x ); 4726 4727 vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); 4728 t0 = vmlaq_f32( vConstants, t0, x ); 4729 t0 = vmulq_f32(t0, root); 4730 4731 __n128 t1 = vsubq_f32(g_XMPi, t0); 4732 t0 = vbslq_f32( nonnegative, t0, t1 ); 4733 t0 = vsubq_f32(g_XMHalfPi, t0); 4734 return t0; 4735#elif defined(_XM_SSE_INTRINSICS_) 4736 __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); 4737 __m128 mvalue = _mm_sub_ps(g_XMZero, V); 4738 __m128 x = _mm_max_ps(V, mvalue); // |V| 4739 4740 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
// (continuation of the preceding function's SSE path — its opening lines are above this excerpt)
    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)

    // Compute polynomial approximation (Horner evaluation, one splatted
    // coefficient of g_XMArcEstCoefficients per step)
    const XMVECTOR AEC = g_XMArcEstCoefficients;
    XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 t0 = _mm_mul_ps(vConstants, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, root);

    // Blend per lane: keep t0 where V >= 0, use Pi - t0 where V < 0.
    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
    t0 = _mm_and_ps(nonnegative, t0);
    t1 = _mm_andnot_ps(nonnegative, t1);
    t0 = _mm_or_ps(t0, t1);
    t0 = _mm_sub_ps(g_XMHalfPi, t0);
    return t0;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Per-component arccosine estimate: 3-degree minimax polynomial scaled by
// sqrt(1-|V|), with (Pi - result) selected for negative components.
// NOTE(review): (1-|V|) is clamped to zero before the sqrt, so inputs
// slightly outside [-1,1] do not produce NaNs here.

inline XMVECTOR XMVectorACosEst
(
    FXMVECTOR V
)
{
    // 3-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result.vector4_f32[0] = XMScalarACosEst( V.vector4_f32[0] );
    Result.vector4_f32[1] = XMScalarACosEst( V.vector4_f32[1] );
    Result.vector4_f32[2] = XMScalarACosEst( V.vector4_f32[2] );
    Result.vector4_f32[3] = XMScalarACosEst( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 nonnegative = vcgeq_f32(V, g_XMZero);
    __n128 x = vabsq_f32(V);

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __n128 oneMValue = vsubq_f32(g_XMOne, x);
    __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
    __n128 root = XMVectorSqrt(clampOneMValue);

    // Compute polynomial approximation
    // (vmlaq_f32(a, b, c) == a + b*c — Horner steps with fused multiply-add)
    const XMVECTOR AEC = g_XMArcEstCoefficients;
    __n128 t0 = vdupq_lane_f32(vget_high_f32(AEC), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0);
    t0 = vmlaq_f32( vConstants, t0, x );
    t0 = vmulq_f32(t0, root);

    // Select t0 for V >= 0, Pi - t0 for V < 0.
    __n128 t1 = vsubq_f32(g_XMPi, t0);
    t0 = vbslq_f32( nonnegative, t0, t1 );
    return t0;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
    __m128 x = _mm_max_ps(V, mvalue);  // |V| = max(V, -V)

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)

    // Compute polynomial approximation
    const XMVECTOR AEC = g_XMArcEstCoefficients;
    XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 t0 = _mm_mul_ps(vConstants, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, root);

    // Mask blend: keep t0 where V >= 0, use Pi - t0 where V < 0.
    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
    t0 = _mm_and_ps(nonnegative, t0);
    t1 = _mm_andnot_ps(nonnegative, t1);
    t0 = _mm_or_ps(t0, t1);
    return t0;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

namespace Internal
{

// Scalar arctangent estimate used by the no-intrinsics path of
// XMVectorATanEst.  Range-reduces to |y| <= 1 and evaluates a 9-degree
// (odd) minimax polynomial; for |Value| > 1 it uses the identity
// atan(x) = sign * Pi/2 - atan(1/x).
inline float XMScalarATanEst
(
    float Value
)
{
    float y, sign;
    if (fabsf(Value) <= 1.0f)
    {
        y = Value;
        sign = 0.0f;     // sign == 0 marks "no Pi/2 fix-up needed"
    }
    else if (Value > 1.0f)
    {
        y = 1.0f / Value;
        sign = 1.0f;
    }
    else
    {
        y = 1.0f / Value;
        sign = -1.0f;
    }

    // 9-degree minimax approximation
    float y2 = y*y;
    float poly = ((((0.0208351f*y2-0.085133f)*y2+0.180141f)*y2-0.3302995f)*y2+0.999866f)*y;

    return (sign == 0.0f ? poly : sign*XM_PIDIV2 - poly);
}

}; // namespace Internal

//------------------------------------------------------------------------------
// Per-component arctangent estimate (9-degree minimax).  Arguments with
// |V| > 1 are reduced via the reciprocal and fixed up with +/- Pi/2.
// NOTE(review): the NEON/SSE paths use an *estimated* reciprocal / division
// for the reduction, so extreme magnitudes lose accuracy — expected for
// the Est variant.

inline XMVECTOR XMVectorATanEst
(
    FXMVECTOR V
)
{
    // 9-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result.vector4_f32[0] = Internal::XMScalarATanEst( V.vector4_f32[0] );
    Result.vector4_f32[1] = Internal::XMScalarATanEst( V.vector4_f32[1] );
    Result.vector4_f32[2] = Internal::XMScalarATanEst( V.vector4_f32[2] );
    Result.vector4_f32[3] = Internal::XMScalarATanEst( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 absV = vabsq_f32(V);
    __n128 invV = XMVectorReciprocalEst(V);
    // sign = +1 where V > 1, -1 where V < -1, 0 where |V| <= 1
    __n128 comp = vcgtq_f32(V, g_XMOne);
    __n128 sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne );
    comp = vcleq_f32(absV, g_XMOne);
    sign = vbslq_f32(comp, g_XMZero, sign );
    // x = V where |V| <= 1, else 1/V (estimate)
    __n128 x = vbslq_f32(comp, V, invV );

    __n128 x2 = vmulq_f32(x, x);

    // Compute polynomial approximation
    const XMVECTOR AEC = g_XMATanEstCoefficients1;
    __n128 Result = vdupq_lane_f32(vget_high_f32(AEC), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
    Result = vmlaq_f32( vConstants, Result, x2 );

    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
    Result = vmlaq_f32( vConstants, Result, x2 );

    vConstants = vdupq_lane_f32(vget_low_f32( AEC), 0);
    Result = vmlaq_f32( vConstants, Result, x2 );

    // ATanEstCoefficients0 is already splatted
    Result = vmlaq_f32( g_XMATanEstCoefficients0, Result, x2 );
    Result = vmulq_f32( Result, x );

    // Reduced lanes need sign*Pi/2 - poly
    float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi);
    result1 = vsubq_f32(result1, Result);

    comp = vceqq_f32(sign, g_XMZero);
    Result = vbslq_f32( comp, Result, result1 );
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 absV =
XMVectorAbs(V);
    __m128 invV = _mm_div_ps(g_XMOne, V);
    // sign = +1 where V > 1, -1 where V < -1, 0 where |V| <= 1
    // (and-with-mask / andnot / or implements a branchless select)
    __m128 comp = _mm_cmpgt_ps(V, g_XMOne);
    __m128 select0 = _mm_and_ps(comp, g_XMOne);
    __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
    __m128 sign = _mm_or_ps(select0, select1);
    comp = _mm_cmple_ps(absV, g_XMOne);
    select0 = _mm_and_ps(comp, g_XMZero);
    select1 = _mm_andnot_ps(comp, sign);
    sign = _mm_or_ps(select0, select1);
    // x = V where |V| <= 1, else 1/V
    select0 = _mm_and_ps(comp, V);
    select1 = _mm_andnot_ps(comp, invV);
    __m128 x = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation (Horner, in x^2 — the polynomial is odd)
    const XMVECTOR AEC = g_XMATanEstCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 Result = _mm_mul_ps(vConstants, x2);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    // ATanEstCoefficients0 is already splatted
    Result = _mm_add_ps(Result, g_XMATanEstCoefficients0);
    Result = _mm_mul_ps(Result, x);
    // Reduced lanes (sign != 0) need sign*Pi/2 - poly
    __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi);
    result1 = _mm_sub_ps(result1, Result);

    comp = _mm_cmpeq_ps(sign, g_XMZero);
    select0 = _mm_and_ps(comp, Result);
    select1 = _mm_andnot_ps(comp, result1);
    Result = _mm_or_ps(select0, select1);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Per-component atan2 estimate.  Special cases (Y==0, X==0, infinities) are
// resolved first with mask selects; remaining lanes fall through to
// XMVectorATanEst(Y/X) with a Pi correction for negative X.

inline XMVECTOR XMVectorATan2Est
(
    FXMVECTOR Y,
    FXMVECTOR X
)
{
    static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */};

    const XMVECTOR Zero = XMVectorZero();
    XMVECTOR ATanResultValid = XMVectorTrueInt();

    XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
    XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
    XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
    XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);

    XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
    XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
    // XIsPositive: sign bit of X clear (note: -0.0f counts as negative here)
    XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
    XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
    XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
    XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);

    // OR Y's sign bit into each special-case constant so results carry Y's sign
    XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
    Pi = XMVectorOrInt(Pi, YSign);
    PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
    PiOverFour = XMVectorOrInt(PiOverFour, YSign);
    ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);

    // Build the special-case result; lanes still equal to ATanResultValid
    // (all-ones) after the selects need the generic computation below.
    XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
    XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
    XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
    XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
    XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
    XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
    ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);

    // Generic path: atan(Y/X) via the estimated reciprocal, +/-Pi when X < 0
    XMVECTOR Reciprocal = XMVectorReciprocalEst(X);
    XMVECTOR V = XMVectorMultiply(Y, Reciprocal);
    XMVECTOR R0 = XMVectorATanEst(V);

    R1 = XMVectorSelect( Pi, Zero, XIsPositive );
    R2 = XMVectorAdd(R0, R1);

    Result = XMVectorSelect(Result, R2, ATanResultValid);

    return Result;
}

//------------------------------------------------------------------------------
// Linear interpolation: V0 + t * (V1 - V0), scalar t applied to all lanes.

inline XMVECTOR XMVectorLerp
(
    FXMVECTOR V0,
    FXMVECTOR
V1,
    float t
)
{
    // V0 + t * (V1 - V0)

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Scale = XMVectorReplicate(t);
    XMVECTOR Length = XMVectorSubtract(V1, V0);
    return XMVectorMultiplyAdd(Length, Scale, V0);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR L = vsubq_f32( V1, V0 );
    return vmlaq_n_f32( V0, L, t );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR L = _mm_sub_ps( V1, V0 );
    XMVECTOR S = _mm_set_ps1( t );
    XMVECTOR Result = _mm_mul_ps( L, S );
    return _mm_add_ps( Result, V0 );
// NOTE(review): empty fallback branch; elsewhere in this file the same spot
// reads `#else // _XM_VMX128_INTRINSICS_` — confirm this macro name is intended.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Linear interpolation with a per-lane parameter: V0 + T * (V1 - V0).

inline XMVECTOR XMVectorLerpV
(
    FXMVECTOR V0,
    FXMVECTOR V1,
    FXMVECTOR T
)
{
    // V0 + T * (V1 - V0)

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Length = XMVectorSubtract(V1, V0);
    return XMVectorMultiplyAdd(Length, T, V0);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR L = vsubq_f32( V1, V0 );
    return vmlaq_f32( V0, L, T );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR Length = _mm_sub_ps( V1, V0 );
    XMVECTOR Result = _mm_mul_ps( Length, T );
    return _mm_add_ps( Result, V0 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Hermite spline interpolation between Position0/Position1 with tangents
// Tangent0/Tangent1 at scalar parameter t (the four cubic basis functions
// are spelled out in the comment below).

inline XMVECTOR XMVectorHermite
(
    FXMVECTOR Position0,
    FXMVECTOR Tangent0,
    FXMVECTOR Position1,
    GXMVECTOR Tangent1,
    float t
)
{
    // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
    //          (t^3 - 2 * t^2 + t) * Tangent0 +
    //          (-2 * t^3 + 3 * t^2) * Position1 +
    //          (t^3 - t^2) * Tangent1

#if defined(_XM_NO_INTRINSICS_)

    float t2 = t * t;
    float t3 = t * t2;

    XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f);
    XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t);
    XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2);
    XMVECTOR T1 = XMVectorReplicate(t3 - t2);

    XMVECTOR Result = XMVectorMultiply(P0, Position0);
    Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
    Result = XMVectorMultiplyAdd(P1, Position1, Result);
    Result = XMVectorMultiplyAdd(T1, Tangent1, Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float t2 = t * t;
    float t3 = t * t2;

    XMVECTOR P0 = vdupq_n_f32(2.0f * t3 - 3.0f * t2 + 1.0f);
    XMVECTOR T0 = vdupq_n_f32(t3 - 2.0f * t2 + t);
    XMVECTOR P1 = vdupq_n_f32(-2.0f * t3 + 3.0f * t2);
    XMVECTOR T1 = vdupq_n_f32(t3 - t2);

    XMVECTOR vResult = vmulq_f32(P0, Position0);
    vResult = vmlaq_f32( vResult, T0, Tangent0 );
    vResult = vmlaq_f32( vResult, P1, Position1 );
    vResult = vmlaq_f32( vResult, T1, Tangent1 );
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    float t2 = t * t;
    float t3 = t * t2;

    XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f);
    XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t);
    XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2);
    XMVECTOR T1 = _mm_set_ps1(t3 - t2);

    XMVECTOR vResult = _mm_mul_ps(P0, Position0);
    XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0);
    vResult = _mm_add_ps(vResult,vTemp);
    vTemp = _mm_mul_ps(P1, Position1);
    vResult = _mm_add_ps(vResult,vTemp);
    vTemp = _mm_mul_ps(T1, Tangent1);
    vResult = _mm_add_ps(vResult,vTemp);
    return vResult;
// NOTE(review): empty fallback branch, as in XMVectorLerp above — confirm macro.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Hermite spline interpolation with a per-lane parameter T: component i of T
// supplies the t used for basis function i (see the scalar branch below).

inline XMVECTOR XMVectorHermiteV
(
    FXMVECTOR Position0,
    FXMVECTOR Tangent0,
    FXMVECTOR Position1,
    GXMVECTOR Tangent1,
    CXMVECTOR T
)
{
    // Result = (2 * t^3 - 3 *
// t^2 + 1) * Position0 +
//          (t^3 - 2 * t^2 + t) * Tangent0 +
//          (-2 * t^3 + 3 * t^2) * Position1 +
//          (t^3 - t^2) * Tangent1

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR T2 = XMVectorMultiply(T, T);
    XMVECTOR T3 = XMVectorMultiply(T , T2);

    // Note: each basis coefficient is built from the matching lane of T
    XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f);
    XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]);
    XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]);
    XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]);

    XMVECTOR Result = XMVectorMultiply(P0, Position0);
    Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
    Result = XMVectorMultiplyAdd(P1, Position1, Result);
    Result = XMVectorMultiplyAdd(T1, Tangent1, Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // The four basis polynomials are evaluated at once: lane i of T3 ends up
    // holding basis coefficient i after the constant multiplies/fix-ups below.
    static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f};
    static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f};

    XMVECTOR T2 = vmulq_f32(T,T);
    XMVECTOR T3 = vmulq_f32(T,T2);
    // Mul by the constants against t^2
    T2 = vmulq_f32(T2,CatMulT2);
    // Mul by the constants against t^3
    T3 = vmlaq_f32(T2, T3, CatMulT3 );
    // T3 now has the pre-result.
    // I need to add t.y only
    T2 = vandq_u32(T,g_XMMaskY);
    T3 = vaddq_f32(T3,T2);
    // Add 1.0f to x
    T3 = vaddq_f32(T3,g_XMIdentityR0);
    // Now, I have the constants created
    // Mul the x constant to Position0
    XMVECTOR vResult = vdupq_lane_f32( vget_low_f32( T3 ), 0 ); // T3[0]
    vResult = vmulq_f32(vResult,Position0);
    // Mul the y constant to Tangent0
    T2 = vdupq_lane_f32( vget_low_f32( T3 ), 1 ); // T3[1]
    vResult = vmlaq_f32(vResult, T2, Tangent0 );
    // Mul the z constant to Position1
    T2 = vdupq_lane_f32( vget_high_f32( T3 ), 0 ); // T3[2]
    vResult = vmlaq_f32(vResult, T2, Position1 );
    // Mul the w constant to Tangent1
    T3 = vdupq_lane_f32( vget_high_f32( T3 ), 1 ); // T3[3]
    vResult = vmlaq_f32(vResult, T3, Tangent1 );
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f};
    static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f};

    XMVECTOR T2 = _mm_mul_ps(T,T);
    XMVECTOR T3 = _mm_mul_ps(T,T2);
    // Mul by the constants against t^2
    T2 = _mm_mul_ps(T2,CatMulT2);
    // Mul by the constants against t^3
    T3 = _mm_mul_ps(T3,CatMulT3);
    // T3 now has the pre-result.
    T3 = _mm_add_ps(T3,T2);
    // I need to add t.y only
    T2 = _mm_and_ps(T,g_XMMaskY);
    T3 = _mm_add_ps(T3,T2);
    // Add 1.0f to x
    T3 = _mm_add_ps(T3,g_XMIdentityR0);
    // Now, I have the constants created
    // Mul the x constant to Position0
    XMVECTOR vResult = XM_PERMUTE_PS(T3,_MM_SHUFFLE(0,0,0,0));
    vResult = _mm_mul_ps(vResult,Position0);
    // Mul the y constant to Tangent0
    T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(1,1,1,1));
    T2 = _mm_mul_ps(T2,Tangent0);
    vResult = _mm_add_ps(vResult,T2);
    // Mul the z constant to Position1
    T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(2,2,2,2));
    T2 = _mm_mul_ps(T2,Position1);
    vResult = _mm_add_ps(vResult,T2);
    // Mul the w constant to Tangent1
    T3 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(3,3,3,3));
    T3 = _mm_mul_ps(T3,Tangent1);
    vResult = _mm_add_ps(vResult,T3);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Catmull-Rom spline interpolation through Position1..Position2 at scalar t,
// with Position0/Position3 as the outer control points.

inline XMVECTOR XMVectorCatmullRom
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR Position3,
    float t
)
{
    // Result = ((-t^3 + 2 * t^2 - t) * Position0 +
    //           (3 * t^3 - 5 * t^2 + 2) * Position1 +
    //           (-3 * t^3 + 4 * t^2 + t) * Position2 +
    //           (t^3 - t^2) * Position3) * 0.5

#if defined(_XM_NO_INTRINSICS_)

    float t2 = t * t;
    float t3 = t * t2;

    // The 0.5 factor is folded into each basis coefficient
    XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f);
    XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
    XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
    XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f);

    XMVECTOR Result = XMVectorMultiply(P0, Position0);
    Result = XMVectorMultiplyAdd(P1, Position1, Result);
    Result = XMVectorMultiplyAdd(P2, Position2, Result);
    Result =
XMVectorMultiplyAdd(P3, Position3, Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float t2 = t * t;
    float t3 = t * t2;

    XMVECTOR P0 = vdupq_n_f32((-t3 + 2.0f * t2 - t) * 0.5f);
    XMVECTOR P1 = vdupq_n_f32((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
    XMVECTOR P2 = vdupq_n_f32((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
    XMVECTOR P3 = vdupq_n_f32((t3 - t2) * 0.5f);

    // Pairwise multiply-accumulate, then a final add
    P1 = vmulq_f32(P1, Position1);
    P0 = vmlaq_f32(P1, P0, Position0);
    P3 = vmulq_f32(P3, Position3);
    P2 = vmlaq_f32(P3, P2, Position2);
    P0 = vaddq_f32(P0,P2);
    return P0;
#elif defined(_XM_SSE_INTRINSICS_)
    float t2 = t * t;
    float t3 = t * t2;

    XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f);
    XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
    XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
    XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f);

    P0 = _mm_mul_ps(P0, Position0);
    P1 = _mm_mul_ps(P1, Position1);
    P2 = _mm_mul_ps(P2, Position2);
    P3 = _mm_mul_ps(P3, Position3);
    P0 = _mm_add_ps(P0,P1);
    P2 = _mm_add_ps(P2,P3);
    P0 = _mm_add_ps(P0,P2);
    return P0;
// NOTE(review): empty fallback branch, as in XMVectorLerp — confirm macro name.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Catmull-Rom interpolation with a per-lane parameter: component i of T is
// the t used for component i of the result (see the scalar branch).

inline XMVECTOR XMVectorCatmullRomV
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR Position3,
    CXMVECTOR T
)
{
#if defined(_XM_NO_INTRINSICS_)
    float fx = T.vector4_f32[0];
    float fy = T.vector4_f32[1];
    float fz = T.vector4_f32[2];
    float fw = T.vector4_f32[3];
    XMVECTOR vResult = {
        0.5f*((-fx*fx*fx+2*fx*fx-fx)*Position0.vector4_f32[0]+
        (3*fx*fx*fx-5*fx*fx+2)*Position1.vector4_f32[0]+
        (-3*fx*fx*fx+4*fx*fx+fx)*Position2.vector4_f32[0]+
        (fx*fx*fx-fx*fx)*Position3.vector4_f32[0]),
        0.5f*((-fy*fy*fy+2*fy*fy-fy)*Position0.vector4_f32[1]+
        (3*fy*fy*fy-5*fy*fy+2)*Position1.vector4_f32[1]+
        (-3*fy*fy*fy+4*fy*fy+fy)*Position2.vector4_f32[1]+
        (fy*fy*fy-fy*fy)*Position3.vector4_f32[1]),
        0.5f*((-fz*fz*fz+2*fz*fz-fz)*Position0.vector4_f32[2]+
        (3*fz*fz*fz-5*fz*fz+2)*Position1.vector4_f32[2]+
        (-3*fz*fz*fz+4*fz*fz+fz)*Position2.vector4_f32[2]+
        (fz*fz*fz-fz*fz)*Position3.vector4_f32[2]),
        0.5f*((-fw*fw*fw+2*fw*fw-fw)*Position0.vector4_f32[3]+
        (3*fw*fw*fw-5*fw*fw+2)*Position1.vector4_f32[3]+
        (-3*fw*fw*fw+4*fw*fw+fw)*Position2.vector4_f32[3]+
        (fw*fw*fw-fw*fw)*Position3.vector4_f32[3])
    };
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f};
    static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f};
    static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f};
    static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f};
    // Cache T^2 and T^3
    XMVECTOR T2 = vmulq_f32(T,T);
    XMVECTOR T3 = vmulq_f32(T,T2);
    // Perform the Position0 term: (2*t^2 - t - t^3)
    XMVECTOR vResult = vaddq_f32(T2,T2);
    vResult = vsubq_f32(vResult,T);
    vResult = vsubq_f32(vResult,T3);
    vResult = vmulq_f32(vResult,Position0);
    // Perform the Position1 term and add: (3*t^3 - 5*t^2 + 2)
    XMVECTOR vTemp = vmulq_f32(T3,Catmul3);
    vTemp = vmlsq_f32(vTemp, T2, Catmul5);
    vTemp = vaddq_f32(vTemp,Catmul2);
    vResult = vmlaq_f32(vResult, vTemp, Position1);
    // Perform the Position2 term and add: (4*t^2 - 3*t^3 + t)
    vTemp = vmulq_f32(T2,Catmul4);
    vTemp = vmlsq_f32(vTemp, T3, Catmul3);
    vTemp = vaddq_f32(vTemp,T);
    vResult = vmlaq_f32(vResult, vTemp, Position2);
    // Position3 is the last term: (t^3 - t^2)
    T3 = vsubq_f32(T3,T2);
    vResult = vmlaq_f32(vResult, T3, Position3);
    // Multiply by 0.5f and exit
    vResult = vmulq_f32(vResult,g_XMOneHalf);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f};
    static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f};
    static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f};
    static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f};
    // Cache T^2 and T^3
    XMVECTOR T2 = _mm_mul_ps(T,T);
    XMVECTOR T3 = _mm_mul_ps(T,T2);
    // Perform the Position0 term
    XMVECTOR vResult = _mm_add_ps(T2,T2);
    vResult = _mm_sub_ps(vResult,T);
    vResult = _mm_sub_ps(vResult,T3);
    vResult = _mm_mul_ps(vResult,Position0);
    // Perform the Position1 term and add
    XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3);
    XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5);
    vTemp = _mm_sub_ps(vTemp,vTemp2);
    vTemp = _mm_add_ps(vTemp,Catmul2);
    vTemp = _mm_mul_ps(vTemp,Position1);
    vResult = _mm_add_ps(vResult,vTemp);
    // Perform the Position2 term and add
    vTemp = _mm_mul_ps(T2,Catmul4);
    vTemp2 = _mm_mul_ps(T3,Catmul3);
    vTemp = _mm_sub_ps(vTemp,vTemp2);
    vTemp = _mm_add_ps(vTemp,T);
    vTemp = _mm_mul_ps(vTemp,Position2);
    vResult = _mm_add_ps(vResult,vTemp);
    // Position3 is the last term
    T3 = _mm_sub_ps(T3,T2);
    T3 = _mm_mul_ps(T3,Position3);
    vResult = _mm_add_ps(vResult,T3);
    // Multiply by 0.5f and exit
    vResult = _mm_mul_ps(vResult,g_XMOneHalf);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Barycentric combination of three points with scalar weights f and g.

inline XMVECTOR XMVectorBaryCentric
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    float f,
    float g
)
{
    // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0)

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR P10 = XMVectorSubtract(Position1, Position0);
    XMVECTOR ScaleF = XMVectorReplicate(f);

    XMVECTOR P20 =
XMVectorSubtract(Position2, Position0);
    XMVECTOR ScaleG = XMVectorReplicate(g);

    XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0);
    Result = XMVectorMultiplyAdd(P20, ScaleG, Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR R1 = vsubq_f32(Position1,Position0);
    XMVECTOR SF = vdupq_n_f32(f);
    XMVECTOR R2 = vsubq_f32(Position2,Position0);
    XMVECTOR SG = vdupq_n_f32(g);
    R1 = vmlaq_f32( Position0, R1, SF);
    return vmlaq_f32( R1, R2, SG );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR R1 = _mm_sub_ps(Position1,Position0);
    XMVECTOR SF = _mm_set_ps1(f);
    XMVECTOR R2 = _mm_sub_ps(Position2,Position0);
    XMVECTOR SG = _mm_set_ps1(g);
    R1 = _mm_mul_ps(R1,SF);
    R2 = _mm_mul_ps(R2,SG);
    R1 = _mm_add_ps(R1,Position0);
    R1 = _mm_add_ps(R1,R2);
    return R1;
// NOTE(review): empty fallback branch, as in XMVectorLerp — confirm macro name.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Barycentric combination with per-lane weight vectors F and G.

inline XMVECTOR XMVectorBaryCentricV
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR F,
    CXMVECTOR G
)
{
    // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0)

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR P10 = XMVectorSubtract(Position1, Position0);
    XMVECTOR P20 = XMVectorSubtract(Position2, Position0);

    XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0);
    Result = XMVectorMultiplyAdd(P20, G, Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR R1 = vsubq_f32(Position1,Position0);
    XMVECTOR R2 = vsubq_f32(Position2,Position0);
    R1 = vmlaq_f32( Position0, R1, F );
    return vmlaq_f32( R1, R2, G);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR R1 = _mm_sub_ps(Position1,Position0);
    XMVECTOR R2 = _mm_sub_ps(Position2,Position0);
    R1 = _mm_mul_ps(R1,F);
    R2 = _mm_mul_ps(R2,G);
    R1 = _mm_add_ps(R1,Position0);
    R1 = _mm_add_ps(R1,R2);
    return R1;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

/****************************************************************************
 *
 * 2D Vector
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// True when the x and y components of V1 and V2 are exactly equal
// (z and w are ignored by every 2D comparison below).

inline bool XMVector2Equal
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
// z and w are don't care
    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllTrue(XMVector2EqualR(V1, V2));
#endif
}


//------------------------------------------------------------------------------
// CR6-style record for x/y equality: XM_CRMASK_CR6TRUE when both lanes equal,
// XM_CRMASK_CR6FALSE when both differ, 0 when mixed.

inline uint32_t XMVector2EqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    uint32_t CR = 0;
    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] == V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] != V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
// z and w are don't care
    int iTest = _mm_movemask_ps(vTemp)&3;
    uint32_t CR = 0;
    if (iTest==3)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Bitwise (integer) equality of the x and y components.

inline bool XMVector2EqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)==3) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllTrue(XMVector2EqualIntR(V1, V2));
#endif
}

//------------------------------------------------------------------------------
// CR6-style record for bitwise x/y equality: CR6TRUE when both lanes match,
// CR6FALSE when both differ, 0 when mixed.

inline uint32_t XMVector2EqualIntR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    uint32_t CR = 0;
    if ((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
        (V1.vector4_u32[1] == V2.vector4_u32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) &&
        (V1.vector4_u32[1] != V2.vector4_u32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&3;
    uint32_t CR = 0;
    if (iTest==3)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// True when |V1 - V2| <= Epsilon component-wise for x and y.

inline bool XMVector2NearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
)
{
#if defined(_XM_NO_INTRINSICS_)
    float dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
    float dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
    return ((dx <= Epsilon.vector4_f32[0]) &&
        (dy <= Epsilon.vector4_f32[1]));
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vDelta = vsub_f32(vget_low_u32(V1), vget_low_u32(V2));
    // vacle: absolute compare |delta| <= epsilon
    __n64 vTemp = vacle_f32( vDelta, vget_low_u32(Epsilon) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    return ( r == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Get the difference
    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
    // Get the absolute value of the difference (max of delta and -delta)
    XMVECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_sub_ps(vTemp,vDelta);
    vTemp = _mm_max_ps(vTemp,vDelta);
    vTemp = _mm_cmple_ps(vTemp,Epsilon);
    // z and w are don't care
    return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0);
#else
// _XM_VMX128_INTRINSICS_ 5710#endif // _XM_VMX128_INTRINSICS_ 5711} 5712 5713//------------------------------------------------------------------------------ 5714 5715inline bool XMVector2NotEqual 5716( 5717 FXMVECTOR V1, 5718 FXMVECTOR V2 5719) 5720{ 5721#if defined(_XM_NO_INTRINSICS_) 5722 return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0); 5723#elif defined(_XM_ARM_NEON_INTRINSICS_) 5724 __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5725 return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); 5726#elif defined(_XM_SSE_INTRINSICS_) 5727 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 5728// z and w are don't care 5729 return (((_mm_movemask_ps(vTemp)&3)!=3) != 0); 5730#else // _XM_VMX128_INTRINSICS_ 5731 return XMComparisonAnyFalse(XMVector2EqualR(V1, V2)); 5732#endif 5733} 5734 5735//------------------------------------------------------------------------------ 5736 5737inline bool XMVector2NotEqualInt 5738( 5739 FXMVECTOR V1, 5740 FXMVECTOR V2 5741) 5742{ 5743#if defined(_XM_NO_INTRINSICS_) 5744 return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0); 5745#elif defined(_XM_ARM_NEON_INTRINSICS_) 5746 __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); 5747 return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); 5748#elif defined(_XM_SSE_INTRINSICS_) 5749 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 5750 return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)!=3) != 0); 5751#else // _XM_VMX128_INTRINSICS_ 5752 return XMComparisonAnyFalse(XMVector2EqualIntR(V1, V2)); 5753#endif 5754} 5755 5756//------------------------------------------------------------------------------ 5757 5758inline bool XMVector2Greater 5759( 5760 FXMVECTOR V1, 5761 FXMVECTOR V2 5762) 5763{ 5764#if defined(_XM_NO_INTRINSICS_) 5765 return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0); 5766#elif 
defined(_XM_ARM_NEON_INTRINSICS_) 5767 __n64 vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5768 return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); 5769#elif defined(_XM_SSE_INTRINSICS_) 5770 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 5771// z and w are don't care 5772 return (((_mm_movemask_ps(vTemp)&3)==3) != 0); 5773#else // _XM_VMX128_INTRINSICS_ 5774 return XMComparisonAllTrue(XMVector2GreaterR(V1, V2)); 5775#endif 5776} 5777 5778//------------------------------------------------------------------------------ 5779 5780inline uint32_t XMVector2GreaterR 5781( 5782 FXMVECTOR V1, 5783 FXMVECTOR V2 5784) 5785{ 5786#if defined(_XM_NO_INTRINSICS_) 5787 5788 uint32_t CR = 0; 5789 if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && 5790 (V1.vector4_f32[1] > V2.vector4_f32[1])) 5791 { 5792 CR = XM_CRMASK_CR6TRUE; 5793 } 5794 else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && 5795 (V1.vector4_f32[1] <= V2.vector4_f32[1])) 5796 { 5797 CR = XM_CRMASK_CR6FALSE; 5798 } 5799 return CR; 5800 5801#elif defined(_XM_ARM_NEON_INTRINSICS_) 5802 __n64 vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5803 uint64_t r = vget_lane_u64( vTemp, 0 ); 5804 uint32_t CR = 0; 5805 if ( r == 0xFFFFFFFFFFFFFFFFU ) 5806 { 5807 CR = XM_CRMASK_CR6TRUE; 5808 } 5809 else if ( !r ) 5810 { 5811 CR = XM_CRMASK_CR6FALSE; 5812 } 5813 return CR; 5814#elif defined(_XM_SSE_INTRINSICS_) 5815 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 5816 int iTest = _mm_movemask_ps(vTemp)&3; 5817 uint32_t CR = 0; 5818 if (iTest==3) 5819 { 5820 CR = XM_CRMASK_CR6TRUE; 5821 } 5822 else if (!iTest) 5823 { 5824 CR = XM_CRMASK_CR6FALSE; 5825 } 5826 return CR; 5827#else // _XM_VMX128_INTRINSICS_ 5828#endif // _XM_VMX128_INTRINSICS_ 5829} 5830 5831//------------------------------------------------------------------------------ 5832 5833inline bool XMVector2GreaterOrEqual 5834( 5835 FXMVECTOR V1, 5836 FXMVECTOR V2 5837) 5838{ 5839#if defined(_XM_NO_INTRINSICS_) 5840 return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && 
(V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); 5841#elif defined(_XM_ARM_NEON_INTRINSICS_) 5842 __n64 vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5843 return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); 5844#elif defined(_XM_SSE_INTRINSICS_) 5845 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 5846 return (((_mm_movemask_ps(vTemp)&3)==3) != 0); 5847#else // _XM_VMX128_INTRINSICS_ 5848 return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V1, V2)); 5849#endif 5850} 5851 5852//------------------------------------------------------------------------------ 5853 5854inline uint32_t XMVector2GreaterOrEqualR 5855( 5856 FXMVECTOR V1, 5857 FXMVECTOR V2 5858) 5859{ 5860#if defined(_XM_NO_INTRINSICS_) 5861 5862 uint32_t CR = 0; 5863 if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && 5864 (V1.vector4_f32[1] >= V2.vector4_f32[1])) 5865 { 5866 CR = XM_CRMASK_CR6TRUE; 5867 } 5868 else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && 5869 (V1.vector4_f32[1] < V2.vector4_f32[1])) 5870 { 5871 CR = XM_CRMASK_CR6FALSE; 5872 } 5873 return CR; 5874 5875#elif defined(_XM_ARM_NEON_INTRINSICS_) 5876 __n64 vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5877 uint64_t r = vget_lane_u64( vTemp, 0 ); 5878 uint32_t CR = 0; 5879 if ( r == 0xFFFFFFFFFFFFFFFFU ) 5880 { 5881 CR = XM_CRMASK_CR6TRUE; 5882 } 5883 else if ( !r ) 5884 { 5885 CR = XM_CRMASK_CR6FALSE; 5886 } 5887 return CR; 5888#elif defined(_XM_SSE_INTRINSICS_) 5889 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 5890 int iTest = _mm_movemask_ps(vTemp)&3; 5891 uint32_t CR = 0; 5892 if (iTest == 3) 5893 { 5894 CR = XM_CRMASK_CR6TRUE; 5895 } 5896 else if (!iTest) 5897 { 5898 CR = XM_CRMASK_CR6FALSE; 5899 } 5900 return CR; 5901#else // _XM_VMX128_INTRINSICS_ 5902#endif // _XM_VMX128_INTRINSICS_ 5903} 5904 5905//------------------------------------------------------------------------------ 5906 5907inline bool XMVector2Less 5908( 5909 FXMVECTOR V1, 5910 FXMVECTOR V2 5911) 5912{ 5913#if defined(_XM_NO_INTRINSICS_) 5914 return 
(((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); 5915#elif defined(_XM_ARM_NEON_INTRINSICS_) 5916 __n64 vTemp = vclt_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5917 return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); 5918#elif defined(_XM_SSE_INTRINSICS_) 5919 XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); 5920 return (((_mm_movemask_ps(vTemp)&3)==3) != 0); 5921#else // _XM_VMX128_INTRINSICS_ 5922 return XMComparisonAllTrue(XMVector2GreaterR(V2, V1)); 5923#endif 5924} 5925 5926//------------------------------------------------------------------------------ 5927 5928inline bool XMVector2LessOrEqual 5929( 5930 FXMVECTOR V1, 5931 FXMVECTOR V2 5932) 5933{ 5934#if defined(_XM_NO_INTRINSICS_) 5935 return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0); 5936#elif defined(_XM_ARM_NEON_INTRINSICS_) 5937 __n64 vTemp = vcle_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5938 return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); 5939#elif defined(_XM_SSE_INTRINSICS_) 5940 XMVECTOR vTemp = _mm_cmple_ps(V1,V2); 5941 return (((_mm_movemask_ps(vTemp)&3)==3) != 0); 5942#else // _XM_VMX128_INTRINSICS_ 5943 return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V2, V1)); 5944#endif 5945} 5946 5947//------------------------------------------------------------------------------ 5948 5949inline bool XMVector2InBounds 5950( 5951 FXMVECTOR V, 5952 FXMVECTOR Bounds 5953) 5954{ 5955#if defined(_XM_NO_INTRINSICS_) 5956 return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && 5957 (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0); 5958#elif defined(_XM_ARM_NEON_INTRINSICS_) 5959 __n64 VL = vget_low_f32( V ); 5960 __n64 B = vget_low_f32( Bounds ); 5961 // Test if less than or equal 5962 __n64 vTemp1 = vcle_f32(VL,B); 5963 // Negate the bounds 5964 __n64 vTemp2 = vneg_f32(B); 5965 // Test if greater or equal 
(Reversed) 5966 vTemp2 = vcle_f32(vTemp2,VL); 5967 // Blend answers 5968 vTemp1 = vand_u32(vTemp1,vTemp2); 5969 // x and y in bounds? 5970 return ( vget_lane_u64( vTemp1, 0 ) == 0xFFFFFFFFFFFFFFFFU ); 5971#elif defined(_XM_SSE_INTRINSICS_) 5972 // Test if less than or equal 5973 XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); 5974 // Negate the bounds 5975 XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); 5976 // Test if greater or equal (Reversed) 5977 vTemp2 = _mm_cmple_ps(vTemp2,V); 5978 // Blend answers 5979 vTemp1 = _mm_and_ps(vTemp1,vTemp2); 5980 // x and y in bounds? (z and w are don't care) 5981 return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0); 5982#else // _XM_VMX128_INTRINSICS_ 5983 return XMComparisonAllInBounds(XMVector2InBoundsR(V, Bounds)); 5984#endif 5985} 5986 5987 5988//------------------------------------------------------------------------------ 5989 5990inline bool XMVector2IsNaN 5991( 5992 FXMVECTOR V 5993) 5994{ 5995#if defined(_XM_NO_INTRINSICS_) 5996 return (XMISNAN(V.vector4_f32[0]) || 5997 XMISNAN(V.vector4_f32[1])); 5998#elif defined(_XM_ARM_NEON_INTRINSICS_) 5999 __n64 VL = vget_low_f32( V ); 6000 // Test against itself. NaN is always not equal 6001 __n64 vTempNan = vceq_f32( VL, VL ); 6002 // If x or y are NaN, the mask is zero 6003 return ( vget_lane_u64( vTempNan, 0 ) != 0xFFFFFFFFFFFFFFFFU ); 6004#elif defined(_XM_SSE_INTRINSICS_) 6005 // Test against itself. 
NaN is always not equal 6006 XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); 6007 // If x or y are NaN, the mask is non-zero 6008 return ((_mm_movemask_ps(vTempNan)&3) != 0); 6009#else // _XM_VMX128_INTRINSICS_ 6010#endif // _XM_VMX128_INTRINSICS_ 6011} 6012 6013//------------------------------------------------------------------------------ 6014 6015inline bool XMVector2IsInfinite 6016( 6017 FXMVECTOR V 6018) 6019{ 6020#if defined(_XM_NO_INTRINSICS_) 6021 6022 return (XMISINF(V.vector4_f32[0]) || 6023 XMISINF(V.vector4_f32[1])); 6024#elif defined(_XM_ARM_NEON_INTRINSICS_) 6025 // Mask off the sign bit 6026 __n64 vTemp = vand_u32( vget_low_f32( V ) , vget_low_f32( g_XMAbsMask ) ); 6027 // Compare to infinity 6028 vTemp = vceq_f32(vTemp, vget_low_f32( g_XMInfinity) ); 6029 // If any are infinity, the signs are true. 6030 return vget_lane_u64( vTemp, 0 ) != 0; 6031#elif defined(_XM_SSE_INTRINSICS_) 6032 // Mask off the sign bit 6033 __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); 6034 // Compare to infinity 6035 vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); 6036 // If x or z are infinity, the signs are true. 
6037 return ((_mm_movemask_ps(vTemp)&3) != 0); 6038#else // _XM_VMX128_INTRINSICS_ 6039#endif // _XM_VMX128_INTRINSICS_ 6040} 6041 6042//------------------------------------------------------------------------------ 6043// Computation operations 6044//------------------------------------------------------------------------------ 6045 6046//------------------------------------------------------------------------------ 6047 6048inline XMVECTOR XMVector2Dot 6049( 6050 FXMVECTOR V1, 6051 FXMVECTOR V2 6052) 6053{ 6054#if defined(_XM_NO_INTRINSICS_) 6055 6056 XMVECTOR Result; 6057 Result.vector4_f32[0] = 6058 Result.vector4_f32[1] = 6059 Result.vector4_f32[2] = 6060 Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1]; 6061 return Result; 6062 6063#elif defined(_XM_ARM_NEON_INTRINSICS_) 6064 // Perform the dot product on x and y 6065 __n64 vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) ); 6066 vTemp = vpadd_f32( vTemp, vTemp ); 6067 return vcombine_f32( vTemp, vTemp ); 6068#elif defined(_XM_SSE_INTRINSICS_) 6069 // Perform the dot product on x and y 6070 XMVECTOR vLengthSq = _mm_mul_ps(V1,V2); 6071 // vTemp has y splatted 6072 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6073 // x+y 6074 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6075 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6076 return vLengthSq; 6077#else // _XM_VMX128_INTRINSICS_ 6078#endif // _XM_VMX128_INTRINSICS_ 6079} 6080 6081//------------------------------------------------------------------------------ 6082 6083inline XMVECTOR XMVector2Cross 6084( 6085 FXMVECTOR V1, 6086 FXMVECTOR V2 6087) 6088{ 6089 // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ] 6090 6091#if defined(_XM_NO_INTRINSICS_) 6092 float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]); 6093 XMVECTOR vResult = { 6094 fCross, 6095 fCross, 6096 fCross, 6097 fCross 6098 }; 6099 return vResult; 6100#elif 
defined(_XM_ARM_NEON_INTRINSICS_) 6101 static const XMVECTORF32 Negate = { 1.f, -1.f, 0, 0 }; 6102 6103 __n64 vTemp = vmul_f32( vget_low_f32( V1 ), vrev64_f32( vget_low_f32( V2 ) ) ); 6104 vTemp = vmul_f32( vTemp, vget_low_f32( Negate ) ); 6105 vTemp = vpadd_f32( vTemp, vTemp ); 6106 return vcombine_f32( vTemp, vTemp ); 6107#elif defined(_XM_SSE_INTRINSICS_) 6108 // Swap x and y 6109 XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(0,1,0,1)); 6110 // Perform the muls 6111 vResult = _mm_mul_ps(vResult,V1); 6112 // Splat y 6113 XMVECTOR vTemp = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); 6114 // Sub the values 6115 vResult = _mm_sub_ss(vResult,vTemp); 6116 // Splat the cross product 6117 vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,0,0,0)); 6118 return vResult; 6119#else // _XM_VMX128_INTRINSICS_ 6120#endif // _XM_VMX128_INTRINSICS_ 6121} 6122 6123//------------------------------------------------------------------------------ 6124 6125inline XMVECTOR XMVector2LengthSq 6126( 6127 FXMVECTOR V 6128) 6129{ 6130 return XMVector2Dot(V, V); 6131} 6132 6133//------------------------------------------------------------------------------ 6134 6135inline XMVECTOR XMVector2ReciprocalLengthEst 6136( 6137 FXMVECTOR V 6138) 6139{ 6140#if defined(_XM_NO_INTRINSICS_) 6141 6142 XMVECTOR Result; 6143 Result = XMVector2LengthSq(V); 6144 Result = XMVectorReciprocalSqrtEst(Result); 6145 return Result; 6146 6147#elif defined(_XM_ARM_NEON_INTRINSICS_) 6148 __n64 VL = vget_low_f32(V); 6149 // Dot2 6150 __n64 vTemp = vmul_f32( VL, VL ); 6151 vTemp = vpadd_f32( vTemp, vTemp ); 6152 // Reciprocal sqrt (estimate) 6153 vTemp = vrsqrte_f32( vTemp ); 6154 return vcombine_f32( vTemp, vTemp ); 6155#elif defined(_XM_SSE_INTRINSICS_) 6156 // Perform the dot product on x and y 6157 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6158 // vTemp has y splatted 6159 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6160 // x+y 6161 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6162 vLengthSq = 
_mm_rsqrt_ss(vLengthSq); 6163 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6164 return vLengthSq; 6165#else // _XM_VMX128_INTRINSICS_ 6166#endif // _XM_VMX128_INTRINSICS_ 6167} 6168 6169//------------------------------------------------------------------------------ 6170 6171inline XMVECTOR XMVector2ReciprocalLength 6172( 6173 FXMVECTOR V 6174) 6175{ 6176#if defined(_XM_NO_INTRINSICS_) 6177 6178 XMVECTOR Result; 6179 Result = XMVector2LengthSq(V); 6180 Result = XMVectorReciprocalSqrt(Result); 6181 return Result; 6182 6183#elif defined(_XM_ARM_NEON_INTRINSICS_) 6184 __n64 VL = vget_low_f32(V); 6185 // Dot2 6186 __n64 vTemp = vmul_f32( VL, VL ); 6187 vTemp = vpadd_f32( vTemp, vTemp ); 6188 // Reciprocal sqrt 6189 __n64 S0 = vrsqrte_f32(vTemp); 6190 __n64 P0 = vmul_f32( vTemp, S0 ); 6191 __n64 R0 = vrsqrts_f32( P0, S0 ); 6192 __n64 S1 = vmul_f32( S0, R0 ); 6193 __n64 P1 = vmul_f32( vTemp, S1 ); 6194 __n64 R1 = vrsqrts_f32( P1, S1 ); 6195 __n64 Result = vmul_f32( S1, R1 ); 6196 return vcombine_f32( Result, Result ); 6197#elif defined(_XM_SSE_INTRINSICS_) 6198 // Perform the dot product on x and y 6199 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6200 // vTemp has y splatted 6201 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6202 // x+y 6203 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6204 vLengthSq = _mm_sqrt_ss(vLengthSq); 6205 vLengthSq = _mm_div_ss(g_XMOne,vLengthSq); 6206 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6207 return vLengthSq; 6208#else // _XM_VMX128_INTRINSICS_ 6209#endif // _XM_VMX128_INTRINSICS_ 6210} 6211 6212//------------------------------------------------------------------------------ 6213 6214inline XMVECTOR XMVector2LengthEst 6215( 6216 FXMVECTOR V 6217) 6218{ 6219#if defined(_XM_NO_INTRINSICS_) 6220 6221 XMVECTOR Result; 6222 Result = XMVector2LengthSq(V); 6223 Result = XMVectorSqrtEst(Result); 6224 return Result; 6225 6226#elif defined(_XM_ARM_NEON_INTRINSICS_) 6227 __n64 VL = vget_low_f32(V); 6228 // Dot2 
6229 __n64 vTemp = vmul_f32( VL, VL ); 6230 vTemp = vpadd_f32( vTemp, vTemp ); 6231 const __n64 zero = vdup_n_u32(0); 6232 __n64 VEqualsZero = vceq_f32( vTemp, zero ); 6233 // Sqrt (estimate) 6234 __n64 Result = vrsqrte_f32( vTemp ); 6235 Result = vmul_f32( vTemp, Result ); 6236 Result = vbsl_f32( VEqualsZero, zero, Result ); 6237 return vcombine_f32( Result, Result ); 6238#elif defined(_XM_SSE_INTRINSICS_) 6239 // Perform the dot product on x and y 6240 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6241 // vTemp has y splatted 6242 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6243 // x+y 6244 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6245 vLengthSq = _mm_sqrt_ss(vLengthSq); 6246 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6247 return vLengthSq; 6248#else // _XM_VMX128_INTRINSICS_ 6249#endif // _XM_VMX128_INTRINSICS_ 6250} 6251 6252//------------------------------------------------------------------------------ 6253 6254inline XMVECTOR XMVector2Length 6255( 6256 FXMVECTOR V 6257) 6258{ 6259#if defined(_XM_NO_INTRINSICS_) 6260 6261 XMVECTOR Result; 6262 Result = XMVector2LengthSq(V); 6263 Result = XMVectorSqrt(Result); 6264 return Result; 6265 6266#elif defined(_XM_ARM_NEON_INTRINSICS_) 6267 __n64 VL = vget_low_f32(V); 6268 // Dot2 6269 __n64 vTemp = vmul_f32( VL, VL ); 6270 vTemp = vpadd_f32( vTemp, vTemp ); 6271 const __n64 zero = vdup_n_u32(0); 6272 __n64 VEqualsZero = vceq_f32( vTemp, zero ); 6273 // Sqrt 6274 __n64 S0 = vrsqrte_f32( vTemp ); 6275 __n64 P0 = vmul_f32( vTemp, S0 ); 6276 __n64 R0 = vrsqrts_f32( P0, S0 ); 6277 __n64 S1 = vmul_f32( S0, R0 ); 6278 __n64 P1 = vmul_f32( vTemp, S1 ); 6279 __n64 R1 = vrsqrts_f32( P1, S1 ); 6280 __n64 Result = vmul_f32( S1, R1 ); 6281 Result = vmul_f32( vTemp, Result ); 6282 Result = vbsl_f32( VEqualsZero, zero, Result ); 6283 return vcombine_f32( Result, Result ); 6284#elif defined(_XM_SSE_INTRINSICS_) 6285 // Perform the dot product on x and y 6286 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6287 // 
vTemp has y splatted 6288 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6289 // x+y 6290 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6291 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6292 vLengthSq = _mm_sqrt_ps(vLengthSq); 6293 return vLengthSq; 6294#else // _XM_VMX128_INTRINSICS_ 6295#endif // _XM_VMX128_INTRINSICS_ 6296} 6297 6298//------------------------------------------------------------------------------ 6299// XMVector2NormalizeEst uses a reciprocal estimate and 6300// returns QNaN on zero and infinite vectors. 6301 6302inline XMVECTOR XMVector2NormalizeEst 6303( 6304 FXMVECTOR V 6305) 6306{ 6307#if defined(_XM_NO_INTRINSICS_) 6308 6309 XMVECTOR Result; 6310 Result = XMVector2ReciprocalLength(V); 6311 Result = XMVectorMultiply(V, Result); 6312 return Result; 6313 6314#elif defined(_XM_ARM_NEON_INTRINSICS_) 6315 __n64 VL = vget_low_f32(V); 6316 // Dot2 6317 __n64 vTemp = vmul_f32( VL, VL ); 6318 vTemp = vpadd_f32( vTemp, vTemp ); 6319 // Reciprocal sqrt (estimate) 6320 vTemp = vrsqrte_f32( vTemp ); 6321 // Normalize 6322 __n64 Result = vmul_f32( VL, vTemp ); 6323 return vcombine_f32( Result, Result ); 6324#elif defined(_XM_SSE_INTRINSICS_) 6325 // Perform the dot product on x and y 6326 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6327 // vTemp has y splatted 6328 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6329 // x+y 6330 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6331 vLengthSq = _mm_rsqrt_ss(vLengthSq); 6332 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6333 vLengthSq = _mm_mul_ps(vLengthSq,V); 6334 return vLengthSq; 6335#else // _XM_VMX128_INTRINSICS_ 6336#endif // _XM_VMX128_INTRINSICS_ 6337} 6338 6339//------------------------------------------------------------------------------ 6340 6341inline XMVECTOR XMVector2Normalize 6342( 6343 FXMVECTOR V 6344) 6345{ 6346#if defined(_XM_NO_INTRINSICS_) 6347 6348 XMVECTOR vResult = XMVector2Length( V ); 6349 float fLength = vResult.vector4_f32[0]; 6350 6351 
// Prevent divide by zero 6352 if (fLength > 0) { 6353 fLength = 1.0f/fLength; 6354 } 6355 6356 vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; 6357 vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; 6358 vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; 6359 vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; 6360 return vResult; 6361 6362#elif defined(_XM_ARM_NEON_INTRINSICS_) 6363 __n64 VL = vget_low_f32(V); 6364 // Dot2 6365 __n64 vTemp = vmul_f32( VL, VL ); 6366 vTemp = vpadd_f32( vTemp, vTemp ); 6367 __n64 VEqualsZero = vceq_f32( vTemp, vdup_n_u32(0) ); 6368 __n64 VEqualsInf = vceq_f32( vTemp, vget_low_f32(g_XMInfinity) ); 6369 // Reciprocal sqrt (2 iterations of Newton-Raphson) 6370 __n64 S0 = vrsqrte_f32( vTemp ); 6371 __n64 P0 = vmul_f32( vTemp, S0 ); 6372 __n64 R0 = vrsqrts_f32( P0, S0 ); 6373 __n64 S1 = vmul_f32( S0, R0 ); 6374 __n64 P1 = vmul_f32( vTemp, S1 ); 6375 __n64 R1 = vrsqrts_f32( P1, S1 ); 6376 vTemp = vmul_f32( S1, R1 ); 6377 // Normalize 6378 __n64 Result = vmul_f32( VL, vTemp ); 6379 Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result ); 6380 Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result ); 6381 return vcombine_f32( Result, Result ); 6382#elif defined(_XM_SSE_INTRINSICS_) 6383 // Perform the dot product on x and y only 6384 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6385 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6386 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6387 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6388 // Prepare for the division 6389 XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); 6390 // Create zero with a single instruction 6391 XMVECTOR vZeroMask = _mm_setzero_ps(); 6392 // Test for a divide by zero (Must be FP to detect -0.0) 6393 vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); 6394 // Failsafe on zero (Or epsilon) length planes 6395 // If the length is infinity, set the elements to zero 6396 vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); 6397 // Reciprocal mul to perform the 
normalization 6398 vResult = _mm_div_ps(V,vResult); 6399 // Any that are infinity, set to zero 6400 vResult = _mm_and_ps(vResult,vZeroMask); 6401 // Select qnan or result based on infinite length 6402 XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); 6403 XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); 6404 vResult = _mm_or_ps(vTemp1,vTemp2); 6405 return vResult; 6406#else // _XM_VMX128_INTRINSICS_ 6407#endif // _XM_VMX128_INTRINSICS_ 6408} 6409 6410//------------------------------------------------------------------------------ 6411 6412inline XMVECTOR XMVector2ClampLength 6413( 6414 FXMVECTOR V, 6415 float LengthMin, 6416 float LengthMax 6417) 6418{ 6419 XMVECTOR ClampMax = XMVectorReplicate(LengthMax); 6420 XMVECTOR ClampMin = XMVectorReplicate(LengthMin); 6421 return XMVector2ClampLengthV(V, ClampMin, ClampMax); 6422} 6423 6424//------------------------------------------------------------------------------ 6425 6426inline XMVECTOR XMVector2ClampLengthV 6427( 6428 FXMVECTOR V, 6429 FXMVECTOR LengthMin, 6430 FXMVECTOR LengthMax 6431) 6432{ 6433 assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin))); 6434 assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax))); 6435 assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero)); 6436 assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero)); 6437 assert(XMVector2GreaterOrEqual(LengthMax, LengthMin)); 6438 6439 XMVECTOR LengthSq = XMVector2LengthSq(V); 6440 6441 const XMVECTOR Zero = XMVectorZero(); 6442 6443 XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); 6444 6445 XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); 6446 XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); 6447 6448 XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); 6449 6450 XMVECTOR Normal = XMVectorMultiply(V, RcpLength); 6451 6452 XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); 6453 Length = XMVectorSelect(LengthSq, Length, Select); 6454 Normal = XMVectorSelect(LengthSq, Normal, Select); 6455 6456 
XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); 6457 XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); 6458 6459 XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); 6460 ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); 6461 6462 XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); 6463 6464 // Preserve the original vector (with no precision loss) if the length falls within the given range 6465 XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); 6466 Result = XMVectorSelect(Result, V, Control); 6467 6468 return Result; 6469} 6470 6471//------------------------------------------------------------------------------ 6472 6473inline XMVECTOR XMVector2Reflect 6474( 6475 FXMVECTOR Incident, 6476 FXMVECTOR Normal 6477) 6478{ 6479 // Result = Incident - (2 * dot(Incident, Normal)) * Normal 6480 6481 XMVECTOR Result; 6482 Result = XMVector2Dot(Incident, Normal); 6483 Result = XMVectorAdd(Result, Result); 6484 Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); 6485 return Result; 6486} 6487 6488//------------------------------------------------------------------------------ 6489 6490inline XMVECTOR XMVector2Refract 6491( 6492 FXMVECTOR Incident, 6493 FXMVECTOR Normal, 6494 float RefractionIndex 6495) 6496{ 6497 XMVECTOR Index = XMVectorReplicate(RefractionIndex); 6498 return XMVector2RefractV(Incident, Normal, Index); 6499} 6500 6501//------------------------------------------------------------------------------ 6502 6503// Return the refraction of a 2D vector 6504inline XMVECTOR XMVector2RefractV 6505( 6506 FXMVECTOR Incident, 6507 FXMVECTOR Normal, 6508 FXMVECTOR RefractionIndex 6509) 6510{ 6511 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + 6512 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) 6513 6514#if defined(_XM_NO_INTRINSICS_) 6515 6516 float IDotN = 
(Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]); 6517 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 6518 float RY = 1.0f-(IDotN*IDotN); 6519 float RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]); 6520 RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]); 6521 if (RX>=0.0f) { 6522 RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX))); 6523 } else { 6524 RX = 0.0f; 6525 } 6526 if (RY>=0.0f) { 6527 RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY))); 6528 } else { 6529 RY = 0.0f; 6530 } 6531 6532 XMVECTOR vResult; 6533 vResult.vector4_f32[0] = RX; 6534 vResult.vector4_f32[1] = RY; 6535 vResult.vector4_f32[2] = 0.0f; 6536 vResult.vector4_f32[3] = 0.0f; 6537 return vResult; 6538 6539#elif defined(_XM_ARM_NEON_INTRINSICS_) 6540 __n64 IL = vget_low_f32( Incident ); 6541 __n64 NL = vget_low_f32( Normal ); 6542 __n64 RIL = vget_low_f32( RefractionIndex ); 6543 // Get the 2D Dot product of Incident-Normal 6544 __n64 vTemp = vmul_f32(IL, NL); 6545 __n64 IDotN = vpadd_f32( vTemp, vTemp ); 6546 // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 6547 vTemp = vmls_f32( vget_low_f32( g_XMOne ), IDotN, IDotN); 6548 vTemp = vmul_f32(vTemp,RIL); 6549 vTemp = vmls_f32(vget_low_f32( g_XMOne ), vTemp, RIL ); 6550 // If any terms are <=0, sqrt() will fail, punt to zero 6551 __n64 vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero) ); 6552 // Sqrt(vTemp) 6553 __n64 S0 = vrsqrte_f32(vTemp); 6554 __n64 P0 = vmul_f32( vTemp, S0 ); 6555 __n64 R0 = vrsqrts_f32( P0, S0 ); 6556 __n64 S1 = vmul_f32( S0, R0 ); 6557 __n64 P1 = vmul_f32( vTemp, S1 ); 6558 __n64 R1 = vrsqrts_f32( P1, S1 ); 6559 __n64 S2 = vmul_f32( S1, R1 ); 6560 vTemp = vmul_f32( vTemp, S2 ); 6561 // R = RefractionIndex * 
IDotN + sqrt(R) 6562 vTemp = vmla_f32( vTemp, RIL, IDotN ); 6563 // Result = RefractionIndex * Incident - Normal * R 6564 __n64 vResult = vmul_f32(RIL,IL); 6565 vResult = vmls_f32( vResult, vTemp, NL ); 6566 vResult = vand_u32(vResult,vMask); 6567 return vcombine_f32(vResult, vResult); 6568#elif defined(_XM_SSE_INTRINSICS_) 6569 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + 6570 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) 6571 // Get the 2D Dot product of Incident-Normal 6572 XMVECTOR IDotN = XMVector2Dot(Incident, Normal); 6573 // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 6574 XMVECTOR vTemp = _mm_mul_ps(IDotN,IDotN); 6575 vTemp = _mm_sub_ps(g_XMOne,vTemp); 6576 vTemp = _mm_mul_ps(vTemp,RefractionIndex); 6577 vTemp = _mm_mul_ps(vTemp,RefractionIndex); 6578 vTemp = _mm_sub_ps(g_XMOne,vTemp); 6579 // If any terms are <=0, sqrt() will fail, punt to zero 6580 XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero); 6581 // R = RefractionIndex * IDotN + sqrt(R) 6582 vTemp = _mm_sqrt_ps(vTemp); 6583 XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN); 6584 vTemp = _mm_add_ps(vTemp,vResult); 6585 // Result = RefractionIndex * Incident - Normal * R 6586 vResult = _mm_mul_ps(RefractionIndex,Incident); 6587 vTemp = _mm_mul_ps(vTemp,Normal); 6588 vResult = _mm_sub_ps(vResult,vTemp); 6589 vResult = _mm_and_ps(vResult,vMask); 6590 return vResult; 6591#else // _XM_VMX128_INTRINSICS_ 6592#endif // _XM_VMX128_INTRINSICS_ 6593} 6594 6595//------------------------------------------------------------------------------ 6596 6597inline XMVECTOR XMVector2Orthogonal 6598( 6599 FXMVECTOR V 6600) 6601{ 6602#if defined(_XM_NO_INTRINSICS_) 6603 6604 XMVECTOR Result; 6605 Result.vector4_f32[0] = -V.vector4_f32[1]; 6606 Result.vector4_f32[1] = V.vector4_f32[0]; 6607 Result.vector4_f32[2] = 0.f; 6608 Result.vector4_f32[3] = 0.f; 6609 return Result; 6610 6611#elif 
defined(_XM_ARM_NEON_INTRINSICS_) 6612 static const XMVECTORF32 Negate = { -1.f, 1.f, 0, 0 }; 6613 const __n64 zero = vdup_n_f32(0); 6614 6615 __n64 VL = vget_low_f32( V ); 6616 __n64 Result = vmul_f32( vrev64_f32( VL ), vget_low_f32( Negate ) ); 6617 return vcombine_f32( Result, zero ); 6618#elif defined(_XM_SSE_INTRINSICS_) 6619 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); 6620 vResult = _mm_mul_ps(vResult,g_XMNegateX); 6621 return vResult; 6622#else // _XM_VMX128_INTRINSICS_ 6623#endif // _XM_VMX128_INTRINSICS_ 6624} 6625 6626//------------------------------------------------------------------------------ 6627 6628inline XMVECTOR XMVector2AngleBetweenNormalsEst 6629( 6630 FXMVECTOR N1, 6631 FXMVECTOR N2 6632) 6633{ 6634#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6635 6636 XMVECTOR Result = XMVector2Dot(N1, N2); 6637 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); 6638 Result = XMVectorACosEst(Result); 6639 return Result; 6640 6641#else // _XM_VMX128_INTRINSICS_ 6642#endif // _XM_VMX128_INTRINSICS_ 6643} 6644 6645//------------------------------------------------------------------------------ 6646 6647inline XMVECTOR XMVector2AngleBetweenNormals 6648( 6649 FXMVECTOR N1, 6650 FXMVECTOR N2 6651) 6652{ 6653#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6654 6655 XMVECTOR Result = XMVector2Dot(N1, N2); 6656 Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne); 6657 Result = XMVectorACos(Result); 6658 return Result; 6659 6660#else // _XM_VMX128_INTRINSICS_ 6661#endif // _XM_VMX128_INTRINSICS_ 6662} 6663 6664//------------------------------------------------------------------------------ 6665 6666inline XMVECTOR XMVector2AngleBetweenVectors 6667( 6668 FXMVECTOR V1, 6669 FXMVECTOR V2 6670) 6671{ 6672#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6673 6674 XMVECTOR L1 = 
XMVector2ReciprocalLength(V1); 6675 XMVECTOR L2 = XMVector2ReciprocalLength(V2); 6676 6677 XMVECTOR Dot = XMVector2Dot(V1, V2); 6678 6679 L1 = XMVectorMultiply(L1, L2); 6680 6681 XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); 6682 CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); 6683 6684 return XMVectorACos(CosAngle); 6685 6686#else // _XM_VMX128_INTRINSICS_ 6687#endif // _XM_VMX128_INTRINSICS_ 6688} 6689 6690//------------------------------------------------------------------------------ 6691 6692inline XMVECTOR XMVector2LinePointDistance 6693( 6694 FXMVECTOR LinePoint1, 6695 FXMVECTOR LinePoint2, 6696 FXMVECTOR Point 6697) 6698{ 6699 // Given a vector PointVector from LinePoint1 to Point and a vector 6700 // LineVector from LinePoint1 to LinePoint2, the scaled distance 6701 // PointProjectionScale from LinePoint1 to the perpendicular projection 6702 // of PointVector onto the line is defined as: 6703 // 6704 // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) 6705 6706#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6707 6708 XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); 6709 XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); 6710 6711 XMVECTOR LengthSq = XMVector2LengthSq(LineVector); 6712 6713 XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector); 6714 PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); 6715 6716 XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); 6717 DistanceVector = XMVectorSubtract(PointVector, DistanceVector); 6718 6719 return XMVector2Length(DistanceVector); 6720 6721#else // _XM_VMX128_INTRINSICS_ 6722#endif // _XM_VMX128_INTRINSICS_ 6723} 6724 6725//------------------------------------------------------------------------------ 6726 6727inline XMVECTOR XMVector2IntersectLine 6728( 6729 FXMVECTOR Line1Point1, 6730 FXMVECTOR Line1Point2, 6731 FXMVECTOR 
Line2Point1, 6732 GXMVECTOR Line2Point2 6733) 6734{ 6735#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6736 6737 XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1); 6738 XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1); 6739 XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1); 6740 6741 XMVECTOR C1 = XMVector2Cross(V1, V2); 6742 XMVECTOR C2 = XMVector2Cross(V2, V3); 6743 6744 XMVECTOR Result; 6745 const XMVECTOR Zero = XMVectorZero(); 6746 if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) 6747 { 6748 if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) 6749 { 6750 // Coincident 6751 Result = g_XMInfinity.v; 6752 } 6753 else 6754 { 6755 // Parallel 6756 Result = g_XMQNaN.v; 6757 } 6758 } 6759 else 6760 { 6761 // Intersection point = Line1Point1 + V1 * (C2 / C1) 6762 XMVECTOR Scale = XMVectorReciprocal(C1); 6763 Scale = XMVectorMultiply(C2, Scale); 6764 Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1); 6765 } 6766 6767 return Result; 6768 6769#elif defined(_XM_SSE_INTRINSICS_) 6770 XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1); 6771 XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1); 6772 XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1); 6773 // Generate the cross products 6774 XMVECTOR C1 = XMVector2Cross(V1, V2); 6775 XMVECTOR C2 = XMVector2Cross(V2, V3); 6776 // If C1 is not close to epsilon, use the calculated value 6777 XMVECTOR vResultMask = _mm_setzero_ps(); 6778 vResultMask = _mm_sub_ps(vResultMask,C1); 6779 vResultMask = _mm_max_ps(vResultMask,C1); 6780 // 0xFFFFFFFF if the calculated value is to be used 6781 vResultMask = _mm_cmpgt_ps(vResultMask,g_XMEpsilon); 6782 // If C1 is close to epsilon, which fail type is it? INFINITY or NAN? 
6783 XMVECTOR vFailMask = _mm_setzero_ps(); 6784 vFailMask = _mm_sub_ps(vFailMask,C2); 6785 vFailMask = _mm_max_ps(vFailMask,C2); 6786 vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon); 6787 XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity); 6788 vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN); 6789 // vFail is NAN or INF 6790 vFail = _mm_or_ps(vFail,vFailMask); 6791 // Intersection point = Line1Point1 + V1 * (C2 / C1) 6792 XMVECTOR vResult = _mm_div_ps(C2,C1); 6793 vResult = _mm_mul_ps(vResult,V1); 6794 vResult = _mm_add_ps(vResult,Line1Point1); 6795 // Use result, or failure value 6796 vResult = _mm_and_ps(vResult,vResultMask); 6797 vResultMask = _mm_andnot_ps(vResultMask,vFail); 6798 vResult = _mm_or_ps(vResult,vResultMask); 6799 return vResult; 6800#else // _XM_VMX128_INTRINSICS_ 6801#endif // _XM_VMX128_INTRINSICS_ 6802} 6803 6804//------------------------------------------------------------------------------ 6805 6806inline XMVECTOR XMVector2Transform 6807( 6808 FXMVECTOR V, 6809 CXMMATRIX M 6810) 6811{ 6812#if defined(_XM_NO_INTRINSICS_) 6813 6814 XMVECTOR Y = XMVectorSplatY(V); 6815 XMVECTOR X = XMVectorSplatX(V); 6816 6817 XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); 6818 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 6819 6820 return Result; 6821 6822#elif defined(_XM_ARM_NEON_INTRINSICS_) 6823 __n64 VL = vget_low_f32( V ); 6824 __n128 Y = vdupq_lane_f32( VL, 1 ); 6825 __n128 Result = vmlaq_f32( M.r[3], Y, M.r[1] ); 6826 __n128 X = vdupq_lane_f32( VL, 0 ); 6827 return vmlaq_f32( Result, X, M.r[0] ); 6828#elif defined(_XM_SSE_INTRINSICS_) 6829 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); 6830 vResult = _mm_mul_ps(vResult,M.r[0]); 6831 XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 6832 vTemp = _mm_mul_ps(vTemp,M.r[1]); 6833 vResult = _mm_add_ps(vResult,vTemp); 6834 vResult = _mm_add_ps(vResult,M.r[3]); 6835 return vResult; 6836#else // _XM_VMX128_INTRINSICS_ 6837#endif // _XM_VMX128_INTRINSICS_ 6838} 6839 
6840//------------------------------------------------------------------------------ 6841 6842_Use_decl_annotations_ 6843inline XMFLOAT4* XMVector2TransformStream 6844( 6845 XMFLOAT4* pOutputStream, 6846 size_t OutputStride, 6847 const XMFLOAT2* pInputStream, 6848 size_t InputStride, 6849 size_t VectorCount, 6850 CXMMATRIX M 6851) 6852{ 6853 assert(pOutputStream != NULL); 6854 assert(pInputStream != NULL); 6855 6856#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6857 6858 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 6859 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 6860 6861 const XMVECTOR row0 = M.r[0]; 6862 const XMVECTOR row1 = M.r[1]; 6863 const XMVECTOR row3 = M.r[3]; 6864 6865 for (size_t i = 0; i < VectorCount; i++) 6866 { 6867 XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); 6868 XMVECTOR Y = XMVectorSplatY(V); 6869 XMVECTOR X = XMVectorSplatX(V); 6870 6871 XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); 6872 Result = XMVectorMultiplyAdd(X, row0, Result); 6873 6874 XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); 6875 6876 pInputVector += InputStride; 6877 pOutputVector += OutputStride; 6878 } 6879 6880 return pOutputStream; 6881 6882#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 6883#endif // _XM_VMX128_INTRINSICS_ 6884} 6885 6886 6887//------------------------------------------------------------------------------ 6888 6889inline XMVECTOR XMVector2TransformCoord 6890( 6891 FXMVECTOR V, 6892 CXMMATRIX M 6893) 6894{ 6895#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6896 6897 XMVECTOR Y = XMVectorSplatY(V); 6898 XMVECTOR X = XMVectorSplatX(V); 6899 6900 XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); 6901 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 6902 6903 XMVECTOR W = XMVectorSplatW(Result); 6904 return XMVectorDivide( Result, W ); 6905 6906#else // _XM_VMX128_INTRINSICS_ 6907#endif // 
_XM_VMX128_INTRINSICS_ 6908} 6909 6910//------------------------------------------------------------------------------ 6911 6912_Use_decl_annotations_ 6913inline XMFLOAT2* XMVector2TransformCoordStream 6914( 6915 XMFLOAT2* pOutputStream, 6916 size_t OutputStride, 6917 const XMFLOAT2* pInputStream, 6918 size_t InputStride, 6919 size_t VectorCount, 6920 CXMMATRIX M 6921) 6922{ 6923 assert(pOutputStream != NULL); 6924 assert(pInputStream != NULL); 6925 6926#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6927 6928 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 6929 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 6930 6931 const XMVECTOR row0 = M.r[0]; 6932 const XMVECTOR row1 = M.r[1]; 6933 const XMVECTOR row3 = M.r[3]; 6934 6935 for (size_t i = 0; i < VectorCount; i++) 6936 { 6937 XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); 6938 XMVECTOR Y = XMVectorSplatY(V); 6939 XMVECTOR X = XMVectorSplatX(V); 6940 6941 XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); 6942 Result = XMVectorMultiplyAdd(X, row0, Result); 6943 6944 XMVECTOR W = XMVectorSplatW(Result); 6945 6946 Result = XMVectorDivide(Result, W); 6947 6948 XMStoreFloat2((XMFLOAT2*)pOutputVector, Result); 6949 6950 pInputVector += InputStride; 6951 pOutputVector += OutputStride; 6952 } 6953 6954 return pOutputStream; 6955 6956#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 6957#endif // _XM_VMX128_INTRINSICS_ 6958} 6959 6960//------------------------------------------------------------------------------ 6961 6962inline XMVECTOR XMVector2TransformNormal 6963( 6964 FXMVECTOR V, 6965 CXMMATRIX M 6966) 6967{ 6968#if defined(_XM_NO_INTRINSICS_) 6969 6970 XMVECTOR Y = XMVectorSplatY(V); 6971 XMVECTOR X = XMVectorSplatX(V); 6972 6973 XMVECTOR Result = XMVectorMultiply(Y, M.r[1]); 6974 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 6975 6976 return Result; 6977 6978#elif defined(_XM_ARM_NEON_INTRINSICS_) 6979 __n64 VL = vget_low_f32( V ); 
6980 __n128 Y = vdupq_lane_f32( VL, 1 ); 6981 __n128 Result = vmulq_f32( Y, M.r[1] ); 6982 __n128 X = vdupq_lane_f32( VL, 0 ); 6983 return vmlaq_f32( Result, X, M.r[0] ); 6984#elif defined(_XM_SSE_INTRINSICS_) 6985 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); 6986 vResult = _mm_mul_ps(vResult,M.r[0]); 6987 XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 6988 vTemp = _mm_mul_ps(vTemp,M.r[1]); 6989 vResult = _mm_add_ps(vResult,vTemp); 6990 return vResult; 6991#else // _XM_VMX128_INTRINSICS_ 6992#endif // _XM_VMX128_INTRINSICS_ 6993} 6994 6995//------------------------------------------------------------------------------ 6996 6997_Use_decl_annotations_ 6998inline XMFLOAT2* XMVector2TransformNormalStream 6999( 7000 XMFLOAT2* pOutputStream, 7001 size_t OutputStride, 7002 const XMFLOAT2* pInputStream, 7003 size_t InputStride, 7004 size_t VectorCount, 7005 CXMMATRIX M 7006) 7007{ 7008 assert(pOutputStream != NULL); 7009 assert(pInputStream != NULL); 7010 7011#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 7012 7013 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 7014 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 7015 7016 const XMVECTOR row0 = M.r[0]; 7017 const XMVECTOR row1 = M.r[1]; 7018 7019 for (size_t i = 0; i < VectorCount; i++) 7020 { 7021 XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); 7022 XMVECTOR Y = XMVectorSplatY(V); 7023 XMVECTOR X = XMVectorSplatX(V); 7024 7025 XMVECTOR Result = XMVectorMultiply(Y, row1); 7026 Result = XMVectorMultiplyAdd(X, row0, Result); 7027 7028 XMStoreFloat2((XMFLOAT2*)pOutputVector, Result); 7029 7030 pInputVector += InputStride; 7031 pOutputVector += OutputStride; 7032 } 7033 7034 return pOutputStream; 7035 7036#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 7037#endif // _XM_VMX128_INTRINSICS_ 7038} 7039 7040/**************************************************************************** 7041 * 7042 * 3D Vector 7043 * 7044 
****************************************************************************/ 7045 7046//------------------------------------------------------------------------------ 7047// Comparison operations 7048//------------------------------------------------------------------------------ 7049 7050//------------------------------------------------------------------------------ 7051 7052inline bool XMVector3Equal 7053( 7054 FXMVECTOR V1, 7055 FXMVECTOR V2 7056) 7057{ 7058#if defined(_XM_NO_INTRINSICS_) 7059 return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0); 7060#elif defined(_XM_ARM_NEON_INTRINSICS_) 7061 __n128 vResult = vceqq_f32( V1, V2 ); 7062 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7063 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7064 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7065#elif defined(_XM_SSE_INTRINSICS_) 7066 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 7067 return (((_mm_movemask_ps(vTemp)&7)==7) != 0); 7068#else // _XM_VMX128_INTRINSICS_ 7069 return XMComparisonAllTrue(XMVector3EqualR(V1, V2)); 7070#endif 7071} 7072 7073//------------------------------------------------------------------------------ 7074 7075inline uint32_t XMVector3EqualR 7076( 7077 FXMVECTOR V1, 7078 FXMVECTOR V2 7079) 7080{ 7081#if defined(_XM_NO_INTRINSICS_) 7082 uint32_t CR = 0; 7083 if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && 7084 (V1.vector4_f32[1] == V2.vector4_f32[1]) && 7085 (V1.vector4_f32[2] == V2.vector4_f32[2])) 7086 { 7087 CR = XM_CRMASK_CR6TRUE; 7088 } 7089 else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && 7090 (V1.vector4_f32[1] != V2.vector4_f32[1]) && 7091 (V1.vector4_f32[2] != V2.vector4_f32[2])) 7092 { 7093 CR = XM_CRMASK_CR6FALSE; 7094 } 7095 return CR; 7096#elif defined(_XM_ARM_NEON_INTRINSICS_) 7097 __n128 vResult = vceqq_f32( V1, V2 ); 7098 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), 
vget_high_u8(vResult)); 7099 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7100 uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; 7101 7102 uint32_t CR = 0; 7103 if ( r == 0xFFFFFFU ) 7104 { 7105 CR = XM_CRMASK_CR6TRUE; 7106 } 7107 else if ( !r ) 7108 { 7109 CR = XM_CRMASK_CR6FALSE; 7110 } 7111 return CR; 7112#elif defined(_XM_SSE_INTRINSICS_) 7113 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 7114 int iTest = _mm_movemask_ps(vTemp)&7; 7115 uint32_t CR = 0; 7116 if (iTest==7) 7117 { 7118 CR = XM_CRMASK_CR6TRUE; 7119 } 7120 else if (!iTest) 7121 { 7122 CR = XM_CRMASK_CR6FALSE; 7123 } 7124 return CR; 7125#else // _XM_VMX128_INTRINSICS_ 7126#endif // _XM_VMX128_INTRINSICS_ 7127} 7128 7129//------------------------------------------------------------------------------ 7130 7131inline bool XMVector3EqualInt 7132( 7133 FXMVECTOR V1, 7134 FXMVECTOR V2 7135) 7136{ 7137#if defined(_XM_NO_INTRINSICS_) 7138 return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); 7139#elif defined(_XM_ARM_NEON_INTRINSICS_) 7140 __n128 vResult = vceqq_u32( V1, V2 ); 7141 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7142 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7143 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7144#elif defined(_XM_SSE_INTRINSICS_) 7145 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 7146 return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)==7) != 0); 7147#else // _XM_VMX128_INTRINSICS_ 7148 return XMComparisonAllTrue(XMVector3EqualIntR(V1, V2)); 7149#endif 7150} 7151 7152//------------------------------------------------------------------------------ 7153 7154inline uint32_t XMVector3EqualIntR 7155( 7156 FXMVECTOR V1, 7157 FXMVECTOR V2 7158) 7159{ 7160#if defined(_XM_NO_INTRINSICS_) 7161 uint32_t CR = 0; 7162 if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && 7163 (V1.vector4_u32[1] == V2.vector4_u32[1]) && 
7164 (V1.vector4_u32[2] == V2.vector4_u32[2])) 7165 { 7166 CR = XM_CRMASK_CR6TRUE; 7167 } 7168 else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && 7169 (V1.vector4_u32[1] != V2.vector4_u32[1]) && 7170 (V1.vector4_u32[2] != V2.vector4_u32[2])) 7171 { 7172 CR = XM_CRMASK_CR6FALSE; 7173 } 7174 return CR; 7175#elif defined(_XM_ARM_NEON_INTRINSICS_) 7176 __n128 vResult = vceqq_u32( V1, V2 ); 7177 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7178 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7179 uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; 7180 7181 uint32_t CR = 0; 7182 if ( r == 0xFFFFFFU ) 7183 { 7184 CR = XM_CRMASK_CR6TRUE; 7185 } 7186 else if ( !r ) 7187 { 7188 CR = XM_CRMASK_CR6FALSE; 7189 } 7190 return CR; 7191#elif defined(_XM_SSE_INTRINSICS_) 7192 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 7193 int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&7; 7194 uint32_t CR = 0; 7195 if (iTemp==7) 7196 { 7197 CR = XM_CRMASK_CR6TRUE; 7198 } 7199 else if (!iTemp) 7200 { 7201 CR = XM_CRMASK_CR6FALSE; 7202 } 7203 return CR; 7204#else // _XM_VMX128_INTRINSICS_ 7205#endif // _XM_VMX128_INTRINSICS_ 7206} 7207 7208//------------------------------------------------------------------------------ 7209 7210inline bool XMVector3NearEqual 7211( 7212 FXMVECTOR V1, 7213 FXMVECTOR V2, 7214 FXMVECTOR Epsilon 7215) 7216{ 7217#if defined(_XM_NO_INTRINSICS_) 7218 float dx, dy, dz; 7219 7220 dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); 7221 dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); 7222 dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); 7223 return (((dx <= Epsilon.vector4_f32[0]) && 7224 (dy <= Epsilon.vector4_f32[1]) && 7225 (dz <= Epsilon.vector4_f32[2])) != 0); 7226#elif defined(_XM_ARM_NEON_INTRINSICS_) 7227 __n128 vDelta = vsubq_f32( V1, V2 ); 7228 __n128 vResult = vacleq_f32( vDelta, Epsilon ); 7229 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7230 vTemp = 
vzip_u16(vTemp.val[0], vTemp.val[1]); 7231 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7232#elif defined(_XM_SSE_INTRINSICS_) 7233 // Get the difference 7234 XMVECTOR vDelta = _mm_sub_ps(V1,V2); 7235 // Get the absolute value of the difference 7236 XMVECTOR vTemp = _mm_setzero_ps(); 7237 vTemp = _mm_sub_ps(vTemp,vDelta); 7238 vTemp = _mm_max_ps(vTemp,vDelta); 7239 vTemp = _mm_cmple_ps(vTemp,Epsilon); 7240 // w is don't care 7241 return (((_mm_movemask_ps(vTemp)&7)==0x7) != 0); 7242#else // _XM_VMX128_INTRINSICS_ 7243#endif // _XM_VMX128_INTRINSICS_ 7244} 7245 7246//------------------------------------------------------------------------------ 7247 7248inline bool XMVector3NotEqual 7249( 7250 FXMVECTOR V1, 7251 FXMVECTOR V2 7252) 7253{ 7254#if defined(_XM_NO_INTRINSICS_) 7255 return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); 7256#elif defined(_XM_ARM_NEON_INTRINSICS_) 7257 __n128 vResult = vceqq_f32( V1, V2 ); 7258 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7259 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7260 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); 7261#elif defined(_XM_SSE_INTRINSICS_) 7262 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 7263 return (((_mm_movemask_ps(vTemp)&7)!=7) != 0); 7264#else // _XM_VMX128_INTRINSICS_ 7265 return XMComparisonAnyFalse(XMVector3EqualR(V1, V2)); 7266#endif 7267} 7268 7269//------------------------------------------------------------------------------ 7270 7271inline bool XMVector3NotEqualInt 7272( 7273 FXMVECTOR V1, 7274 FXMVECTOR V2 7275) 7276{ 7277#if defined(_XM_NO_INTRINSICS_) 7278 return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); 7279#elif defined(_XM_ARM_NEON_INTRINSICS_) 7280 __n128 vResult = vceqq_u32( V1, V2 ); 7281 int8x8x2_t vTemp = 
vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7282 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7283 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); 7284#elif defined(_XM_SSE_INTRINSICS_) 7285 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 7286 return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)!=7) != 0); 7287#else // _XM_VMX128_INTRINSICS_ 7288 return XMComparisonAnyFalse(XMVector3EqualIntR(V1, V2)); 7289#endif 7290} 7291 7292//------------------------------------------------------------------------------ 7293 7294inline bool XMVector3Greater 7295( 7296 FXMVECTOR V1, 7297 FXMVECTOR V2 7298) 7299{ 7300#if defined(_XM_NO_INTRINSICS_) 7301 return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); 7302#elif defined(_XM_ARM_NEON_INTRINSICS_) 7303 __n128 vResult = vcgtq_f32( V1, V2 ); 7304 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7305 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7306 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7307#elif defined(_XM_SSE_INTRINSICS_) 7308 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 7309 return (((_mm_movemask_ps(vTemp)&7)==7) != 0); 7310#else // _XM_VMX128_INTRINSICS_ 7311 return XMComparisonAllTrue(XMVector3GreaterR(V1, V2)); 7312#endif 7313} 7314 7315//------------------------------------------------------------------------------ 7316 7317inline uint32_t XMVector3GreaterR 7318( 7319 FXMVECTOR V1, 7320 FXMVECTOR V2 7321) 7322{ 7323#if defined(_XM_NO_INTRINSICS_) 7324 uint32_t CR = 0; 7325 if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && 7326 (V1.vector4_f32[1] > V2.vector4_f32[1]) && 7327 (V1.vector4_f32[2] > V2.vector4_f32[2])) 7328 { 7329 CR = XM_CRMASK_CR6TRUE; 7330 } 7331 else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && 7332 (V1.vector4_f32[1] <= V2.vector4_f32[1]) && 7333 (V1.vector4_f32[2] <= V2.vector4_f32[2])) 7334 { 7335 CR = 
XM_CRMASK_CR6FALSE; 7336 } 7337 return CR; 7338 7339#elif defined(_XM_ARM_NEON_INTRINSICS_) 7340 __n128 vResult = vcgtq_f32( V1, V2 ); 7341 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7342 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7343 uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; 7344 7345 uint32_t CR = 0; 7346 if ( r == 0xFFFFFFU ) 7347 { 7348 CR = XM_CRMASK_CR6TRUE; 7349 } 7350 else if ( !r ) 7351 { 7352 CR = XM_CRMASK_CR6FALSE; 7353 } 7354 return CR; 7355#elif defined(_XM_SSE_INTRINSICS_) 7356 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 7357 uint32_t CR = 0; 7358 int iTest = _mm_movemask_ps(vTemp)&7; 7359 if (iTest==7) 7360 { 7361 CR = XM_CRMASK_CR6TRUE; 7362 } 7363 else if (!iTest) 7364 { 7365 CR = XM_CRMASK_CR6FALSE; 7366 } 7367 return CR; 7368#else // _XM_VMX128_INTRINSICS_ 7369#endif // _XM_VMX128_INTRINSICS_ 7370} 7371 7372//------------------------------------------------------------------------------ 7373 7374inline bool XMVector3GreaterOrEqual 7375( 7376 FXMVECTOR V1, 7377 FXMVECTOR V2 7378) 7379{ 7380#if defined(_XM_NO_INTRINSICS_) 7381 return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); 7382#elif defined(_XM_ARM_NEON_INTRINSICS_) 7383 __n128 vResult = vcgeq_f32( V1, V2 ); 7384 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7385 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7386 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7387#elif defined(_XM_SSE_INTRINSICS_) 7388 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 7389 return (((_mm_movemask_ps(vTemp)&7)==7) != 0); 7390#else // _XM_VMX128_INTRINSICS_ 7391 return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V1, V2)); 7392#endif 7393} 7394 7395//------------------------------------------------------------------------------ 7396 7397inline uint32_t XMVector3GreaterOrEqualR 7398( 7399 FXMVECTOR V1, 7400 FXMVECTOR V2 7401) 7402{ 7403#if 
defined(_XM_NO_INTRINSICS_) 7404 7405 uint32_t CR = 0; 7406 if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && 7407 (V1.vector4_f32[1] >= V2.vector4_f32[1]) && 7408 (V1.vector4_f32[2] >= V2.vector4_f32[2])) 7409 { 7410 CR = XM_CRMASK_CR6TRUE; 7411 } 7412 else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && 7413 (V1.vector4_f32[1] < V2.vector4_f32[1]) && 7414 (V1.vector4_f32[2] < V2.vector4_f32[2])) 7415 { 7416 CR = XM_CRMASK_CR6FALSE; 7417 } 7418 return CR; 7419 7420#elif defined(_XM_ARM_NEON_INTRINSICS_) 7421 __n128 vResult = vcgeq_f32( V1, V2 ); 7422 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7423 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7424 uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; 7425 7426 uint32_t CR = 0; 7427 if ( r == 0xFFFFFFU ) 7428 { 7429 CR = XM_CRMASK_CR6TRUE; 7430 } 7431 else if ( !r ) 7432 { 7433 CR = XM_CRMASK_CR6FALSE; 7434 } 7435 return CR; 7436#elif defined(_XM_SSE_INTRINSICS_) 7437 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 7438 uint32_t CR = 0; 7439 int iTest = _mm_movemask_ps(vTemp)&7; 7440 if (iTest==7) 7441 { 7442 CR = XM_CRMASK_CR6TRUE; 7443 } 7444 else if (!iTest) 7445 { 7446 CR = XM_CRMASK_CR6FALSE; 7447 } 7448 return CR; 7449#else // _XM_VMX128_INTRINSICS_ 7450#endif // _XM_VMX128_INTRINSICS_ 7451} 7452 7453//------------------------------------------------------------------------------ 7454 7455inline bool XMVector3Less 7456( 7457 FXMVECTOR V1, 7458 FXMVECTOR V2 7459) 7460{ 7461#if defined(_XM_NO_INTRINSICS_) 7462 return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); 7463#elif defined(_XM_ARM_NEON_INTRINSICS_) 7464 __n128 vResult = vcltq_f32( V1, V2 ); 7465 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7466 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7467 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7468#elif defined(_XM_SSE_INTRINSICS_) 7469 XMVECTOR 
vTemp = _mm_cmplt_ps(V1,V2); 7470 return (((_mm_movemask_ps(vTemp)&7)==7) != 0); 7471#else // _XM_VMX128_INTRINSICS_ 7472 return XMComparisonAllTrue(XMVector3GreaterR(V2, V1)); 7473#endif 7474} 7475 7476//------------------------------------------------------------------------------ 7477 7478inline bool XMVector3LessOrEqual 7479( 7480 FXMVECTOR V1, 7481 FXMVECTOR V2 7482) 7483{ 7484#if defined(_XM_NO_INTRINSICS_) 7485 return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); 7486#elif defined(_XM_ARM_NEON_INTRINSICS_) 7487 __n128 vResult = vcleq_f32( V1, V2 ); 7488 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7489 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7490 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7491#elif defined(_XM_SSE_INTRINSICS_) 7492 XMVECTOR vTemp = _mm_cmple_ps(V1,V2); 7493 return (((_mm_movemask_ps(vTemp)&7)==7) != 0); 7494#else // _XM_VMX128_INTRINSICS_ 7495 return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V2, V1)); 7496#endif 7497} 7498 7499//------------------------------------------------------------------------------ 7500 7501inline bool XMVector3InBounds 7502( 7503 FXMVECTOR V, 7504 FXMVECTOR Bounds 7505) 7506{ 7507#if defined(_XM_NO_INTRINSICS_) 7508 return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && 7509 (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && 7510 (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); 7511#elif defined(_XM_ARM_NEON_INTRINSICS_) 7512 // Test if less than or equal 7513 __n128 vTemp1 = vcleq_f32(V,Bounds); 7514 // Negate the bounds 7515 __n128 vTemp2 = vnegq_f32(Bounds); 7516 // Test if greater or equal (Reversed) 7517 vTemp2 = vcleq_f32(vTemp2,V); 7518 // Blend answers 7519 vTemp1 = vandq_u32(vTemp1,vTemp2); 7520 // in bounds? 
7521 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); 7522 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7523 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7524#elif defined(_XM_SSE_INTRINSICS_) 7525 // Test if less than or equal 7526 XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); 7527 // Negate the bounds 7528 XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); 7529 // Test if greater or equal (Reversed) 7530 vTemp2 = _mm_cmple_ps(vTemp2,V); 7531 // Blend answers 7532 vTemp1 = _mm_and_ps(vTemp1,vTemp2); 7533 // x,y and z in bounds? (w is don't care) 7534 return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0); 7535#else 7536 return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); 7537#endif 7538} 7539 7540 7541//------------------------------------------------------------------------------ 7542 7543inline bool XMVector3IsNaN 7544( 7545 FXMVECTOR V 7546) 7547{ 7548#if defined(_XM_NO_INTRINSICS_) 7549 7550 return (XMISNAN(V.vector4_f32[0]) || 7551 XMISNAN(V.vector4_f32[1]) || 7552 XMISNAN(V.vector4_f32[2])); 7553 7554#elif defined(_XM_ARM_NEON_INTRINSICS_) 7555 // Test against itself. NaN is always not equal 7556 __n128 vTempNan = vceqq_f32( V, V ); 7557 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); 7558 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7559 // If x or y or z are NaN, the mask is zero 7560 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); 7561#elif defined(_XM_SSE_INTRINSICS_) 7562 // Test against itself. 
NaN is always not equal 7563 XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); 7564 // If x or y or z are NaN, the mask is non-zero 7565 return ((_mm_movemask_ps(vTempNan)&7) != 0); 7566#else // _XM_VMX128_INTRINSICS_ 7567#endif // _XM_VMX128_INTRINSICS_ 7568} 7569 7570//------------------------------------------------------------------------------ 7571 7572inline bool XMVector3IsInfinite 7573( 7574 FXMVECTOR V 7575) 7576{ 7577#if defined(_XM_NO_INTRINSICS_) 7578 return (XMISINF(V.vector4_f32[0]) || 7579 XMISINF(V.vector4_f32[1]) || 7580 XMISINF(V.vector4_f32[2])); 7581#elif defined(_XM_ARM_NEON_INTRINSICS_) 7582 // Mask off the sign bit 7583 __n128 vTempInf = vandq_u32( V, g_XMAbsMask ); 7584 // Compare to infinity 7585 vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); 7586 // If any are infinity, the signs are true. 7587 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); 7588 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7589 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0 ); 7590#elif defined(_XM_SSE_INTRINSICS_) 7591 // Mask off the sign bit 7592 __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); 7593 // Compare to infinity 7594 vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); 7595 // If x,y or z are infinity, the signs are true. 
7596 return ((_mm_movemask_ps(vTemp)&7) != 0); 7597#else // _XM_VMX128_INTRINSICS_ 7598#endif // _XM_VMX128_INTRINSICS_ 7599} 7600 7601//------------------------------------------------------------------------------ 7602// Computation operations 7603//------------------------------------------------------------------------------ 7604 7605//------------------------------------------------------------------------------ 7606 7607inline XMVECTOR XMVector3Dot 7608( 7609 FXMVECTOR V1, 7610 FXMVECTOR V2 7611) 7612{ 7613#if defined(_XM_NO_INTRINSICS_) 7614 float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2]; 7615 XMVECTOR vResult = { 7616 fValue, 7617 fValue, 7618 fValue, 7619 fValue 7620 }; 7621 return vResult; 7622 7623#elif defined(_XM_ARM_NEON_INTRINSICS_) 7624 __n128 vTemp = vmulq_f32( V1, V2 ); 7625 __n64 v1 = vget_low_f32( vTemp ); 7626 __n64 v2 = vget_high_f32( vTemp ); 7627 v1 = vpadd_f32( v1, v1 ); 7628 v2 = vdup_lane_f32( v2, 0 ); 7629 v1 = vadd_f32( v1, v2 ); 7630 return vcombine_f32( v1, v1 ); 7631#elif defined(_XM_SSE_INTRINSICS_) 7632 // Perform the dot product 7633 XMVECTOR vDot = _mm_mul_ps(V1,V2); 7634 // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2] 7635 XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); 7636 // Result.vector4_f32[0] = x+y 7637 vDot = _mm_add_ss(vDot,vTemp); 7638 // x=Dot.vector4_f32[2] 7639 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7640 // Result.vector4_f32[0] = (x+y)+z 7641 vDot = _mm_add_ss(vDot,vTemp); 7642 // Splat x 7643 return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); 7644#else // _XM_VMX128_INTRINSICS_ 7645#endif // _XM_VMX128_INTRINSICS_ 7646} 7647 7648//------------------------------------------------------------------------------ 7649 7650inline XMVECTOR XMVector3Cross 7651( 7652 FXMVECTOR V1, 7653 FXMVECTOR V2 7654) 7655{ 7656 // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ] 7657 7658#if 
defined(_XM_NO_INTRINSICS_) 7659 XMVECTOR vResult = { 7660 (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]), 7661 (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]), 7662 (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]), 7663 0.0f 7664 }; 7665 return vResult; 7666#elif defined(_XM_ARM_NEON_INTRINSICS_) 7667 __n64 v1xy = vget_low_f32(V1); 7668 __n64 v2xy = vget_low_f32(V2); 7669 7670 __n64 v1yx = vrev64_f32( v1xy ); 7671 __n64 v2yx = vrev64_f32( v2xy ); 7672 7673 __n64 v1zz = vdup_lane_f32( vget_high_f32(V1), 0 ); 7674 __n64 v2zz = vdup_lane_f32( vget_high_f32(V2), 0 ); 7675 7676 __n128 vResult = vmulq_f32( vcombine_f32(v1yx,v1xy), vcombine_f32(v2zz,v2yx) ); 7677 vResult = vmlsq_f32( vResult, vcombine_f32(v1zz,v1yx), vcombine_f32(v2yx,v2xy) ); 7678 return veorq_u32( vResult, g_XMFlipY ); 7679#elif defined(_XM_SSE_INTRINSICS_) 7680 // y1,z1,x1,w1 7681 XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(3,0,2,1)); 7682 // z2,x2,y2,w2 7683 XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(3,1,0,2)); 7684 // Perform the left operation 7685 XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2); 7686 // z1,x1,y1,w1 7687 vTemp1 = XM_PERMUTE_PS(vTemp1,_MM_SHUFFLE(3,0,2,1)); 7688 // y2,z2,x2,w2 7689 vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(3,1,0,2)); 7690 // Perform the right operation 7691 vTemp1 = _mm_mul_ps(vTemp1,vTemp2); 7692 // Subract the right from left, and return answer 7693 vResult = _mm_sub_ps(vResult,vTemp1); 7694 // Set w to zero 7695 return _mm_and_ps(vResult,g_XMMask3); 7696#else // _XM_VMX128_INTRINSICS_ 7697#endif // _XM_VMX128_INTRINSICS_ 7698} 7699 7700//------------------------------------------------------------------------------ 7701 7702inline XMVECTOR XMVector3LengthSq 7703( 7704 FXMVECTOR V 7705) 7706{ 7707 return XMVector3Dot(V, V); 7708} 7709 7710//------------------------------------------------------------------------------ 7711 7712inline XMVECTOR 
XMVector3ReciprocalLengthEst 7713( 7714 FXMVECTOR V 7715) 7716{ 7717#if defined(_XM_NO_INTRINSICS_) 7718 7719 XMVECTOR Result; 7720 7721 Result = XMVector3LengthSq(V); 7722 Result = XMVectorReciprocalSqrtEst(Result); 7723 7724 return Result; 7725 7726#elif defined(_XM_ARM_NEON_INTRINSICS_) 7727 // Dot3 7728 __n128 vTemp = vmulq_f32( V, V ); 7729 __n64 v1 = vget_low_f32( vTemp ); 7730 __n64 v2 = vget_high_f32( vTemp ); 7731 v1 = vpadd_f32( v1, v1 ); 7732 v2 = vdup_lane_f32( v2, 0 ); 7733 v1 = vadd_f32( v1, v2 ); 7734 // Reciprocal sqrt (estimate) 7735 v2 = vrsqrte_f32( v1 ); 7736 return vcombine_f32(v2, v2); 7737#elif defined(_XM_SSE_INTRINSICS_) 7738 // Perform the dot product on x,y and z 7739 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 7740 // vTemp has z and y 7741 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); 7742 // x+z, y 7743 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7744 // y,y,y,y 7745 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7746 // x+z+y,??,??,?? 7747 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7748 // Splat the length squared 7749 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 7750 // Get the reciprocal 7751 vLengthSq = _mm_rsqrt_ps(vLengthSq); 7752 return vLengthSq; 7753#else // _XM_VMX128_INTRINSICS_ 7754#endif // _XM_VMX128_INTRINSICS_ 7755} 7756 7757//------------------------------------------------------------------------------ 7758 7759inline XMVECTOR XMVector3ReciprocalLength 7760( 7761 FXMVECTOR V 7762) 7763{ 7764#if defined(_XM_NO_INTRINSICS_) 7765 7766 XMVECTOR Result; 7767 7768 Result = XMVector3LengthSq(V); 7769 Result = XMVectorReciprocalSqrt(Result); 7770 7771 return Result; 7772 7773#elif defined(_XM_ARM_NEON_INTRINSICS_) 7774 // Dot3 7775 __n128 vTemp = vmulq_f32( V, V ); 7776 __n64 v1 = vget_low_f32( vTemp ); 7777 __n64 v2 = vget_high_f32( vTemp ); 7778 v1 = vpadd_f32( v1, v1 ); 7779 v2 = vdup_lane_f32( v2, 0 ); 7780 v1 = vadd_f32( v1, v2 ); 7781 // Reciprocal sqrt 7782 __n64 S0 = vrsqrte_f32(v1); 7783 
__n64 P0 = vmul_f32( v1, S0 ); 7784 __n64 R0 = vrsqrts_f32( P0, S0 ); 7785 __n64 S1 = vmul_f32( S0, R0 ); 7786 __n64 P1 = vmul_f32( v1, S1 ); 7787 __n64 R1 = vrsqrts_f32( P1, S1 ); 7788 __n64 Result = vmul_f32( S1, R1 ); 7789 return vcombine_f32( Result, Result ); 7790#elif defined(_XM_SSE_INTRINSICS_) 7791 // Perform the dot product 7792 XMVECTOR vDot = _mm_mul_ps(V,V); 7793 // x=Dot.y, y=Dot.z 7794 XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); 7795 // Result.x = x+y 7796 vDot = _mm_add_ss(vDot,vTemp); 7797 // x=Dot.z 7798 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7799 // Result.x = (x+y)+z 7800 vDot = _mm_add_ss(vDot,vTemp); 7801 // Splat x 7802 vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); 7803 // Get the reciprocal 7804 vDot = _mm_sqrt_ps(vDot); 7805 // Get the reciprocal 7806 vDot = _mm_div_ps(g_XMOne,vDot); 7807 return vDot; 7808#else // _XM_VMX128_INTRINSICS_ 7809#endif // _XM_VMX128_INTRINSICS_ 7810} 7811 7812//------------------------------------------------------------------------------ 7813 7814inline XMVECTOR XMVector3LengthEst 7815( 7816 FXMVECTOR V 7817) 7818{ 7819#if defined(_XM_NO_INTRINSICS_) 7820 7821 XMVECTOR Result; 7822 7823 Result = XMVector3LengthSq(V); 7824 Result = XMVectorSqrtEst(Result); 7825 7826 return Result; 7827 7828#elif defined(_XM_ARM_NEON_INTRINSICS_) 7829 // Dot3 7830 __n128 vTemp = vmulq_f32( V, V ); 7831 __n64 v1 = vget_low_f32( vTemp ); 7832 __n64 v2 = vget_high_f32( vTemp ); 7833 v1 = vpadd_f32( v1, v1 ); 7834 v2 = vdup_lane_f32( v2, 0 ); 7835 v1 = vadd_f32( v1, v2 ); 7836 const __n64 zero = vdup_n_u32(0); 7837 __n64 VEqualsZero = vceq_f32( v1, zero ); 7838 // Sqrt (estimate) 7839 __n64 Result = vrsqrte_f32( v1 ); 7840 Result = vmul_f32( v1, Result ); 7841 Result = vbsl_f32( VEqualsZero, zero, Result ); 7842 return vcombine_f32( Result, Result ); 7843#elif defined(_XM_SSE_INTRINSICS_) 7844 // Perform the dot product on x,y and z 7845 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 7846 // vTemp has z and y 
7847 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); 7848 // x+z, y 7849 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7850 // y,y,y,y 7851 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7852 // x+z+y,??,??,?? 7853 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7854 // Splat the length squared 7855 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 7856 // Get the length 7857 vLengthSq = _mm_sqrt_ps(vLengthSq); 7858 return vLengthSq; 7859#else // _XM_VMX128_INTRINSICS_ 7860#endif // _XM_VMX128_INTRINSICS_ 7861} 7862 7863//------------------------------------------------------------------------------ 7864 7865inline XMVECTOR XMVector3Length 7866( 7867 FXMVECTOR V 7868) 7869{ 7870#if defined(_XM_NO_INTRINSICS_) 7871 7872 XMVECTOR Result; 7873 7874 Result = XMVector3LengthSq(V); 7875 Result = XMVectorSqrt(Result); 7876 7877 return Result; 7878 7879#elif defined(_XM_ARM_NEON_INTRINSICS_) 7880 // Dot3 7881 __n128 vTemp = vmulq_f32( V, V ); 7882 __n64 v1 = vget_low_f32( vTemp ); 7883 __n64 v2 = vget_high_f32( vTemp ); 7884 v1 = vpadd_f32( v1, v1 ); 7885 v2 = vdup_lane_f32( v2, 0 ); 7886 v1 = vadd_f32( v1, v2 ); 7887 const __n64 zero = vdup_n_u32(0); 7888 __n64 VEqualsZero = vceq_f32( v1, zero ); 7889 // Sqrt 7890 __n64 S0 = vrsqrte_f32( v1 ); 7891 __n64 P0 = vmul_f32( v1, S0 ); 7892 __n64 R0 = vrsqrts_f32( P0, S0 ); 7893 __n64 S1 = vmul_f32( S0, R0 ); 7894 __n64 P1 = vmul_f32( v1, S1 ); 7895 __n64 R1 = vrsqrts_f32( P1, S1 ); 7896 __n64 Result = vmul_f32( S1, R1 ); 7897 Result = vmul_f32( v1, Result ); 7898 Result = vbsl_f32( VEqualsZero, zero, Result ); 7899 return vcombine_f32( Result, Result ); 7900#elif defined(_XM_SSE_INTRINSICS_) 7901 // Perform the dot product on x,y and z 7902 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 7903 // vTemp has z and y 7904 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); 7905 // x+z, y 7906 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7907 // y,y,y,y 7908 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7909 // 
x+z+y,??,??,?? 7910 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7911 // Splat the length squared 7912 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 7913 // Get the length 7914 vLengthSq = _mm_sqrt_ps(vLengthSq); 7915 return vLengthSq; 7916#else // _XM_VMX128_INTRINSICS_ 7917#endif // _XM_VMX128_INTRINSICS_ 7918} 7919 7920//------------------------------------------------------------------------------ 7921// XMVector3NormalizeEst uses a reciprocal estimate and 7922// returns QNaN on zero and infinite vectors. 7923 7924inline XMVECTOR XMVector3NormalizeEst 7925( 7926 FXMVECTOR V 7927) 7928{ 7929#if defined(_XM_NO_INTRINSICS_) 7930 7931 XMVECTOR Result; 7932 Result = XMVector3ReciprocalLength(V); 7933 Result = XMVectorMultiply(V, Result); 7934 return Result; 7935 7936#elif defined(_XM_ARM_NEON_INTRINSICS_) 7937 // Dot3 7938 __n128 vTemp = vmulq_f32( V, V ); 7939 __n64 v1 = vget_low_f32( vTemp ); 7940 __n64 v2 = vget_high_f32( vTemp ); 7941 v1 = vpadd_f32( v1, v1 ); 7942 v2 = vdup_lane_f32( v2, 0 ); 7943 v1 = vadd_f32( v1, v2 ); 7944 // Reciprocal sqrt (estimate) 7945 v2 = vrsqrte_f32( v1 ); 7946 // Normalize 7947 return vmulq_f32( V, vcombine_f32(v2,v2) ); 7948#elif defined(_XM_SSE_INTRINSICS_) 7949 // Perform the dot product 7950 XMVECTOR vDot = _mm_mul_ps(V,V); 7951 // x=Dot.y, y=Dot.z 7952 XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); 7953 // Result.x = x+y 7954 vDot = _mm_add_ss(vDot,vTemp); 7955 // x=Dot.z 7956 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7957 // Result.x = (x+y)+z 7958 vDot = _mm_add_ss(vDot,vTemp); 7959 // Splat x 7960 vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); 7961 // Get the reciprocal 7962 vDot = _mm_rsqrt_ps(vDot); 7963 // Perform the normalization 7964 vDot = _mm_mul_ps(vDot,V); 7965 return vDot; 7966#else // _XM_VMX128_INTRINSICS_ 7967#endif // _XM_VMX128_INTRINSICS_ 7968} 7969 7970//------------------------------------------------------------------------------ 7971 7972inline XMVECTOR XMVector3Normalize 
7973( 7974 FXMVECTOR V 7975) 7976{ 7977#if defined(_XM_NO_INTRINSICS_) 7978 float fLength; 7979 XMVECTOR vResult; 7980 7981 vResult = XMVector3Length( V ); 7982 fLength = vResult.vector4_f32[0]; 7983 7984 // Prevent divide by zero 7985 if (fLength > 0) { 7986 fLength = 1.0f/fLength; 7987 } 7988 7989 vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; 7990 vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; 7991 vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; 7992 vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; 7993 return vResult; 7994 7995#elif defined(_XM_ARM_NEON_INTRINSICS_) 7996 // Dot3 7997 __n128 vTemp = vmulq_f32( V, V ); 7998 __n64 v1 = vget_low_f32( vTemp ); 7999 __n64 v2 = vget_high_f32( vTemp ); 8000 v1 = vpadd_f32( v1, v1 ); 8001 v2 = vdup_lane_f32( v2, 0 ); 8002 v1 = vadd_f32( v1, v2 ); 8003 __n64 VEqualsZero = vceq_f32( v1, vdup_n_u32(0) ); 8004 __n64 VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); 8005 // Reciprocal sqrt (2 iterations of Newton-Raphson) 8006 __n64 S0 = vrsqrte_f32( v1 ); 8007 __n64 P0 = vmul_f32( v1, S0 ); 8008 __n64 R0 = vrsqrts_f32( P0, S0 ); 8009 __n64 S1 = vmul_f32( S0, R0 ); 8010 __n64 P1 = vmul_f32( v1, S1 ); 8011 __n64 R1 = vrsqrts_f32( P1, S1 ); 8012 v2 = vmul_f32( S1, R1 ); 8013 // Normalize 8014 __n128 vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); 8015 vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); 8016 return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); 8017#elif defined(_XM_SSE_INTRINSICS_) 8018 // Perform the dot product on x,y and z only 8019 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 8020 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1)); 8021 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 8022 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 8023 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 8024 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 8025 // Prepare for the division 8026 XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); 8027 // 
Create zero with a single instruction 8028 XMVECTOR vZeroMask = _mm_setzero_ps(); 8029 // Test for a divide by zero (Must be FP to detect -0.0) 8030 vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); 8031 // Failsafe on zero (Or epsilon) length planes 8032 // If the length is infinity, set the elements to zero 8033 vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); 8034 // Divide to perform the normalization 8035 vResult = _mm_div_ps(V,vResult); 8036 // Any that are infinity, set to zero 8037 vResult = _mm_and_ps(vResult,vZeroMask); 8038 // Select qnan or result based on infinite length 8039 XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); 8040 XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); 8041 vResult = _mm_or_ps(vTemp1,vTemp2); 8042 return vResult; 8043#else // _XM_VMX128_INTRINSICS_ 8044#endif // _XM_VMX128_INTRINSICS_ 8045} 8046 8047//------------------------------------------------------------------------------ 8048 8049inline XMVECTOR XMVector3ClampLength 8050( 8051 FXMVECTOR V, 8052 float LengthMin, 8053 float LengthMax 8054) 8055{ 8056 XMVECTOR ClampMax = XMVectorReplicate(LengthMax); 8057 XMVECTOR ClampMin = XMVectorReplicate(LengthMin); 8058 8059 return XMVector3ClampLengthV(V, ClampMin, ClampMax); 8060} 8061 8062//------------------------------------------------------------------------------ 8063 8064inline XMVECTOR XMVector3ClampLengthV 8065( 8066 FXMVECTOR V, 8067 FXMVECTOR LengthMin, 8068 FXMVECTOR LengthMax 8069) 8070{ 8071 assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin))); 8072 assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax))); 8073 assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); 8074 assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); 8075 assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); 8076 8077 XMVECTOR LengthSq = XMVector3LengthSq(V); 8078 8079 const XMVECTOR Zero = XMVectorZero(); 
8080 8081 XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); 8082 8083 XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); 8084 XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); 8085 8086 XMVECTOR Normal = XMVectorMultiply(V, RcpLength); 8087 8088 XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); 8089 8090 XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); 8091 Length = XMVectorSelect(LengthSq, Length, Select); 8092 Normal = XMVectorSelect(LengthSq, Normal, Select); 8093 8094 XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); 8095 XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); 8096 8097 XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); 8098 ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); 8099 8100 XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); 8101 8102 // Preserve the original vector (with no precision loss) if the length falls within the given range 8103 XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); 8104 Result = XMVectorSelect(Result, V, Control); 8105 8106 return Result; 8107} 8108 8109//------------------------------------------------------------------------------ 8110 8111inline XMVECTOR XMVector3Reflect 8112( 8113 FXMVECTOR Incident, 8114 FXMVECTOR Normal 8115) 8116{ 8117 // Result = Incident - (2 * dot(Incident, Normal)) * Normal 8118 8119 XMVECTOR Result = XMVector3Dot(Incident, Normal); 8120 Result = XMVectorAdd(Result, Result); 8121 Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); 8122 8123 return Result; 8124} 8125 8126//------------------------------------------------------------------------------ 8127 8128inline XMVECTOR XMVector3Refract 8129( 8130 FXMVECTOR Incident, 8131 FXMVECTOR Normal, 8132 float RefractionIndex 8133) 8134{ 8135 XMVECTOR Index = XMVectorReplicate(RefractionIndex); 8136 return XMVector3RefractV(Incident, Normal, Index); 8137} 8138 
8139//------------------------------------------------------------------------------ 8140 8141inline XMVECTOR XMVector3RefractV 8142( 8143 FXMVECTOR Incident, 8144 FXMVECTOR Normal, 8145 FXMVECTOR RefractionIndex 8146) 8147{ 8148 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + 8149 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) 8150 8151#if defined(_XM_NO_INTRINSICS_) 8152 8153 const XMVECTOR Zero = XMVectorZero(); 8154 8155 XMVECTOR IDotN = XMVector3Dot(Incident, Normal); 8156 8157 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 8158 XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); 8159 R = XMVectorMultiply(R, RefractionIndex); 8160 R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); 8161 8162 if (XMVector4LessOrEqual(R, Zero)) 8163 { 8164 // Total internal reflection 8165 return Zero; 8166 } 8167 else 8168 { 8169 // R = RefractionIndex * IDotN + sqrt(R) 8170 R = XMVectorSqrt(R); 8171 R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); 8172 8173 // Result = RefractionIndex * Incident - Normal * R 8174 XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); 8175 Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); 8176 8177 return Result; 8178 } 8179 8180#elif defined(_XM_ARM_NEON_INTRINSICS_) 8181 XMVECTOR IDotN = XMVector3Dot(Incident,Normal); 8182 8183 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 8184 __n128 R = vmlsq_f32( g_XMOne, IDotN, IDotN); 8185 R = vmulq_f32(R, RefractionIndex); 8186 R = vmlsq_f32(g_XMOne, R, RefractionIndex ); 8187 8188 __n128 vResult = vcleq_f32(R,g_XMZero); 8189 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 8190 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 8191 if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) 8192 { 8193 // Total internal reflection 8194 vResult = g_XMZero; 8195 } 8196 else 8197 { 
8198 // Sqrt(R) 8199 __n128 S0 = vrsqrteq_f32(R); 8200 __n128 P0 = vmulq_f32( R, S0 ); 8201 __n128 R0 = vrsqrtsq_f32( P0, S0 ); 8202 __n128 S1 = vmulq_f32( S0, R0 ); 8203 __n128 P1 = vmulq_f32( R, S1 ); 8204 __n128 R1 = vrsqrtsq_f32( P1, S1 ); 8205 __n128 S2 = vmulq_f32( S1, R1 ); 8206 R = vmulq_f32( R, S2 ); 8207 // R = RefractionIndex * IDotN + sqrt(R) 8208 R = vmlaq_f32( R, RefractionIndex, IDotN ); 8209 // Result = RefractionIndex * Incident - Normal * R 8210 vResult = vmulq_f32(RefractionIndex, Incident); 8211 vResult = vmlsq_f32( vResult, R, Normal ); 8212 } 8213 return vResult; 8214#elif defined(_XM_SSE_INTRINSICS_) 8215 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + 8216 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) 8217 XMVECTOR IDotN = XMVector3Dot(Incident, Normal); 8218 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 8219 XMVECTOR R = _mm_mul_ps(IDotN, IDotN); 8220 R = _mm_sub_ps(g_XMOne,R); 8221 R = _mm_mul_ps(R, RefractionIndex); 8222 R = _mm_mul_ps(R, RefractionIndex); 8223 R = _mm_sub_ps(g_XMOne,R); 8224 8225 XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); 8226 if (_mm_movemask_ps(vResult)==0x0f) 8227 { 8228 // Total internal reflection 8229 vResult = g_XMZero; 8230 } 8231 else 8232 { 8233 // R = RefractionIndex * IDotN + sqrt(R) 8234 R = _mm_sqrt_ps(R); 8235 vResult = _mm_mul_ps(RefractionIndex,IDotN); 8236 R = _mm_add_ps(R,vResult); 8237 // Result = RefractionIndex * Incident - Normal * R 8238 vResult = _mm_mul_ps(RefractionIndex, Incident); 8239 R = _mm_mul_ps(R,Normal); 8240 vResult = _mm_sub_ps(vResult,R); 8241 } 8242 return vResult; 8243#else // _XM_VMX128_INTRINSICS_ 8244#endif // _XM_VMX128_INTRINSICS_ 8245} 8246 8247//------------------------------------------------------------------------------ 8248 8249inline XMVECTOR XMVector3Orthogonal 8250( 8251 FXMVECTOR V 8252) 8253{ 8254#if defined(_XM_NO_INTRINSICS_) || 
defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8255 8256 XMVECTOR Zero = XMVectorZero(); 8257 XMVECTOR Z = XMVectorSplatZ(V); 8258 XMVECTOR YZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V); 8259 8260 XMVECTOR NegativeV = XMVectorSubtract(Zero, V); 8261 8262 XMVECTOR ZIsNegative = XMVectorLess(Z, Zero); 8263 XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero); 8264 8265 XMVECTOR S = XMVectorAdd(YZYY, Z); 8266 XMVECTOR D = XMVectorSubtract(YZYY, Z); 8267 8268 XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative); 8269 8270 XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(NegativeV, S); 8271 XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, D); 8272 8273 return XMVectorSelect(R1, R0, Select); 8274 8275#else // _XM_VMX128_INTRINSICS_ 8276#endif // _XM_VMX128_INTRINSICS_ 8277} 8278 8279//------------------------------------------------------------------------------ 8280 8281inline XMVECTOR XMVector3AngleBetweenNormalsEst 8282( 8283 FXMVECTOR N1, 8284 FXMVECTOR N2 8285) 8286{ 8287#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8288 8289 XMVECTOR Result = XMVector3Dot(N1, N2); 8290 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); 8291 Result = XMVectorACosEst(Result); 8292 return Result; 8293 8294#else // _XM_VMX128_INTRINSICS_ 8295#endif // _XM_VMX128_INTRINSICS_ 8296} 8297 8298//------------------------------------------------------------------------------ 8299 8300inline XMVECTOR XMVector3AngleBetweenNormals 8301( 8302 FXMVECTOR N1, 8303 FXMVECTOR N2 8304) 8305{ 8306#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8307 8308 XMVECTOR Result = XMVector3Dot(N1, N2); 8309 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); 8310 Result = XMVectorACos(Result); 8311 return Result; 8312 8313#else // 
_XM_VMX128_INTRINSICS_ 8314#endif // _XM_VMX128_INTRINSICS_ 8315} 8316 8317//------------------------------------------------------------------------------ 8318 8319inline XMVECTOR XMVector3AngleBetweenVectors 8320( 8321 FXMVECTOR V1, 8322 FXMVECTOR V2 8323) 8324{ 8325#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8326 8327 XMVECTOR L1 = XMVector3ReciprocalLength(V1); 8328 XMVECTOR L2 = XMVector3ReciprocalLength(V2); 8329 8330 XMVECTOR Dot = XMVector3Dot(V1, V2); 8331 8332 L1 = XMVectorMultiply(L1, L2); 8333 8334 XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); 8335 CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); 8336 8337 return XMVectorACos(CosAngle); 8338 8339#else // _XM_VMX128_INTRINSICS_ 8340#endif // _XM_VMX128_INTRINSICS_ 8341} 8342 8343//------------------------------------------------------------------------------ 8344 8345inline XMVECTOR XMVector3LinePointDistance 8346( 8347 FXMVECTOR LinePoint1, 8348 FXMVECTOR LinePoint2, 8349 FXMVECTOR Point 8350) 8351{ 8352 // Given a vector PointVector from LinePoint1 to Point and a vector 8353 // LineVector from LinePoint1 to LinePoint2, the scaled distance 8354 // PointProjectionScale from LinePoint1 to the perpendicular projection 8355 // of PointVector onto the line is defined as: 8356 // 8357 // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) 8358 8359#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8360 8361 XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); 8362 XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); 8363 8364 XMVECTOR LengthSq = XMVector3LengthSq(LineVector); 8365 8366 XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector); 8367 PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); 8368 8369 XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); 8370 DistanceVector = 
XMVectorSubtract(PointVector, DistanceVector); 8371 8372 return XMVector3Length(DistanceVector); 8373 8374#else // _XM_VMX128_INTRINSICS_ 8375#endif // _XM_VMX128_INTRINSICS_ 8376} 8377 8378//------------------------------------------------------------------------------ 8379 8380_Use_decl_annotations_ 8381inline void XMVector3ComponentsFromNormal 8382( 8383 XMVECTOR* pParallel, 8384 XMVECTOR* pPerpendicular, 8385 FXMVECTOR V, 8386 FXMVECTOR Normal 8387) 8388{ 8389 assert(pParallel != NULL); 8390 assert(pPerpendicular != NULL); 8391 8392 XMVECTOR Scale = XMVector3Dot(V, Normal); 8393 8394 XMVECTOR Parallel = XMVectorMultiply(Normal, Scale); 8395 8396 *pParallel = Parallel; 8397 *pPerpendicular = XMVectorSubtract(V, Parallel); 8398} 8399 8400//------------------------------------------------------------------------------ 8401// Transform a vector using a rotation expressed as a unit quaternion 8402 8403inline XMVECTOR XMVector3Rotate 8404( 8405 FXMVECTOR V, 8406 FXMVECTOR RotationQuaternion 8407) 8408{ 8409#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8410 8411 XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); 8412 XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); 8413 XMVECTOR Result = XMQuaternionMultiply(Q, A); 8414 return XMQuaternionMultiply(Result, RotationQuaternion); 8415 8416#else // _XM_VMX128_INTRINSICS_ 8417#endif // _XM_VMX128_INTRINSICS_ 8418} 8419 8420//------------------------------------------------------------------------------ 8421// Transform a vector using the inverse of a rotation expressed as a unit quaternion 8422 8423inline XMVECTOR XMVector3InverseRotate 8424( 8425 FXMVECTOR V, 8426 FXMVECTOR RotationQuaternion 8427) 8428{ 8429#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8430 8431 XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); 8432 XMVECTOR Result = 
XMQuaternionMultiply(RotationQuaternion, A); 8433 XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); 8434 return XMQuaternionMultiply(Result, Q); 8435 8436#else // _XM_VMX128_INTRINSICS_ 8437#endif // _XM_VMX128_INTRINSICS_ 8438} 8439 8440//------------------------------------------------------------------------------ 8441 8442inline XMVECTOR XMVector3Transform 8443( 8444 FXMVECTOR V, 8445 CXMMATRIX M 8446) 8447{ 8448#if defined(_XM_NO_INTRINSICS_) 8449 8450 XMVECTOR Z = XMVectorSplatZ(V); 8451 XMVECTOR Y = XMVectorSplatY(V); 8452 XMVECTOR X = XMVectorSplatX(V); 8453 8454 XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); 8455 Result = XMVectorMultiplyAdd(Y, M.r[1], Result); 8456 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 8457 8458 return Result; 8459 8460#elif defined(_XM_ARM_NEON_INTRINSICS_) 8461 __n64 VL = vget_low_f32( V ); 8462 XMVECTOR vResult = vdupq_lane_f32( VL, 0 ); // X 8463 XMVECTOR vTemp = vdupq_lane_f32( VL, 1 ); // Y 8464 vResult = vmlaq_f32( M.r[3], vResult, M.r[0] ); 8465 vResult = vmlaq_f32( vResult, vTemp, M.r[1] ); 8466 vTemp = vdupq_lane_f32( vget_high_f32( V ), 0 ); // Z 8467 return vmlaq_f32( vResult, vTemp, M.r[2] ); 8468#elif defined(_XM_SSE_INTRINSICS_) 8469 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); 8470 vResult = _mm_mul_ps(vResult,M.r[0]); 8471 XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 8472 vTemp = _mm_mul_ps(vTemp,M.r[1]); 8473 vResult = _mm_add_ps(vResult,vTemp); 8474 vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); 8475 vTemp = _mm_mul_ps(vTemp,M.r[2]); 8476 vResult = _mm_add_ps(vResult,vTemp); 8477 vResult = _mm_add_ps(vResult,M.r[3]); 8478 return vResult; 8479#else // _XM_VMX128_INTRINSICS_ 8480#endif // _XM_VMX128_INTRINSICS_ 8481} 8482 8483//------------------------------------------------------------------------------ 8484 8485_Use_decl_annotations_ 8486inline XMFLOAT4* XMVector3TransformStream 8487( 8488 XMFLOAT4* pOutputStream, 8489 size_t OutputStride, 8490 const XMFLOAT3* 
pInputStream, 8491 size_t InputStride, 8492 size_t VectorCount, 8493 CXMMATRIX M 8494) 8495{ 8496 assert(pOutputStream != NULL); 8497 assert(pInputStream != NULL); 8498 8499#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8500 8501 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 8502 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 8503 8504 const XMVECTOR row0 = M.r[0]; 8505 const XMVECTOR row1 = M.r[1]; 8506 const XMVECTOR row2 = M.r[2]; 8507 const XMVECTOR row3 = M.r[3]; 8508 8509 for (size_t i = 0; i < VectorCount; i++) 8510 { 8511 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); 8512 XMVECTOR Z = XMVectorSplatZ(V); 8513 XMVECTOR Y = XMVectorSplatY(V); 8514 XMVECTOR X = XMVectorSplatX(V); 8515 8516 XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); 8517 Result = XMVectorMultiplyAdd(Y, row1, Result); 8518 Result = XMVectorMultiplyAdd(X, row0, Result); 8519 8520 XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); 8521 8522 pInputVector += InputStride; 8523 pOutputVector += OutputStride; 8524 } 8525 8526 return pOutputStream; 8527 8528#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 8529#endif // _XM_VMX128_INTRINSICS_ 8530} 8531 8532 8533//------------------------------------------------------------------------------ 8534 8535inline XMVECTOR XMVector3TransformCoord 8536( 8537 FXMVECTOR V, 8538 CXMMATRIX M 8539) 8540{ 8541#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8542 8543 XMVECTOR Z = XMVectorSplatZ(V); 8544 XMVECTOR Y = XMVectorSplatY(V); 8545 XMVECTOR X = XMVectorSplatX(V); 8546 8547 XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); 8548 Result = XMVectorMultiplyAdd(Y, M.r[1], Result); 8549 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 8550 8551 XMVECTOR W = XMVectorSplatW(Result); 8552 return XMVectorDivide( Result, W ); 8553 8554#else // _XM_VMX128_INTRINSICS_ 8555#endif // _XM_VMX128_INTRINSICS_ 8556} 8557 
8558//------------------------------------------------------------------------------ 8559 8560_Use_decl_annotations_ 8561inline XMFLOAT3* XMVector3TransformCoordStream 8562( 8563 XMFLOAT3* pOutputStream, 8564 size_t OutputStride, 8565 const XMFLOAT3* pInputStream, 8566 size_t InputStride, 8567 size_t VectorCount, 8568 CXMMATRIX M 8569) 8570{ 8571 assert(pOutputStream != NULL); 8572 assert(pInputStream != NULL); 8573 8574#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8575 8576 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 8577 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 8578 8579 const XMVECTOR row0 = M.r[0]; 8580 const XMVECTOR row1 = M.r[1]; 8581 const XMVECTOR row2 = M.r[2]; 8582 const XMVECTOR row3 = M.r[3]; 8583 8584 for (size_t i = 0; i < VectorCount; i++) 8585 { 8586 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); 8587 XMVECTOR Z = XMVectorSplatZ(V); 8588 XMVECTOR Y = XMVectorSplatY(V); 8589 XMVECTOR X = XMVectorSplatX(V); 8590 8591 XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); 8592 Result = XMVectorMultiplyAdd(Y, row1, Result); 8593 Result = XMVectorMultiplyAdd(X, row0, Result); 8594 8595 XMVECTOR W = XMVectorSplatW(Result); 8596 8597 Result = XMVectorDivide(Result, W); 8598 8599 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); 8600 8601 pInputVector += InputStride; 8602 pOutputVector += OutputStride; 8603 } 8604 8605 return pOutputStream; 8606 8607#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 8608#endif // _XM_VMX128_INTRINSICS_ 8609} 8610 8611//------------------------------------------------------------------------------ 8612 8613inline XMVECTOR XMVector3TransformNormal 8614( 8615 FXMVECTOR V, 8616 CXMMATRIX M 8617) 8618{ 8619#if defined(_XM_NO_INTRINSICS_) 8620 8621 XMVECTOR Z = XMVectorSplatZ(V); 8622 XMVECTOR Y = XMVectorSplatY(V); 8623 XMVECTOR X = XMVectorSplatX(V); 8624 8625 XMVECTOR Result = XMVectorMultiply(Z, M.r[2]); 8626 Result = XMVectorMultiplyAdd(Y, 
M.r[1], Result); 8627 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 8628 8629 return Result; 8630 8631#elif defined(_XM_ARM_NEON_INTRINSICS_) 8632 __n64 VL = vget_low_f32( V ); 8633 XMVECTOR vResult = vdupq_lane_f32( VL, 0 ); // X 8634 XMVECTOR vTemp = vdupq_lane_f32( VL, 1 ); // Y 8635 vResult = vmulq_f32( vResult, M.r[0] ); 8636 vResult = vmlaq_f32( vResult, vTemp, M.r[1] ); 8637 vTemp = vdupq_lane_f32( vget_high_f32( V ), 0 ); // Z 8638 return vmlaq_f32( vResult, vTemp, M.r[2] ); 8639#elif defined(_XM_SSE_INTRINSICS_) 8640 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); 8641 vResult = _mm_mul_ps(vResult,M.r[0]); 8642 XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 8643 vTemp = _mm_mul_ps(vTemp,M.r[1]); 8644 vResult = _mm_add_ps(vResult,vTemp); 8645 vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); 8646 vTemp = _mm_mul_ps(vTemp,M.r[2]); 8647 vResult = _mm_add_ps(vResult,vTemp); 8648 return vResult; 8649#else // _XM_VMX128_INTRINSICS_ 8650#endif // _XM_VMX128_INTRINSICS_ 8651} 8652 8653//------------------------------------------------------------------------------ 8654 8655_Use_decl_annotations_ 8656inline XMFLOAT3* XMVector3TransformNormalStream 8657( 8658 XMFLOAT3* pOutputStream, 8659 size_t OutputStride, 8660 const XMFLOAT3* pInputStream, 8661 size_t InputStride, 8662 size_t VectorCount, 8663 CXMMATRIX M 8664) 8665{ 8666 assert(pOutputStream != NULL); 8667 assert(pInputStream != NULL); 8668 8669#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8670 8671 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 8672 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 8673 8674 const XMVECTOR row0 = M.r[0]; 8675 const XMVECTOR row1 = M.r[1]; 8676 const XMVECTOR row2 = M.r[2]; 8677 8678 for (size_t i = 0; i < VectorCount; i++) 8679 { 8680 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); 8681 XMVECTOR Z = XMVectorSplatZ(V); 8682 XMVECTOR Y = XMVectorSplatY(V); 8683 XMVECTOR X = 
XMVectorSplatX(V); 8684 8685 XMVECTOR Result = XMVectorMultiply(Z, row2); 8686 Result = XMVectorMultiplyAdd(Y, row1, Result); 8687 Result = XMVectorMultiplyAdd(X, row0, Result); 8688 8689 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); 8690 8691 pInputVector += InputStride; 8692 pOutputVector += OutputStride; 8693 } 8694 8695 return pOutputStream; 8696 8697#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 8698#endif // _XM_VMX128_INTRINSICS_ 8699} 8700 8701//------------------------------------------------------------------------------ 8702 8703inline XMVECTOR XMVector3Project 8704( 8705 FXMVECTOR V, 8706 float ViewportX, 8707 float ViewportY, 8708 float ViewportWidth, 8709 float ViewportHeight, 8710 float ViewportMinZ, 8711 float ViewportMaxZ, 8712 CXMMATRIX Projection, 8713 CXMMATRIX View, 8714 CXMMATRIX World 8715) 8716{ 8717#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8718 8719 const float HalfViewportWidth = ViewportWidth * 0.5f; 8720 const float HalfViewportHeight = ViewportHeight * 0.5f; 8721 8722 XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); 8723 XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); 8724 8725 XMMATRIX Transform = XMMatrixMultiply(World, View); 8726 Transform = XMMatrixMultiply(Transform, Projection); 8727 8728 XMVECTOR Result = XMVector3TransformCoord(V, Transform); 8729 8730 Result = XMVectorMultiplyAdd(Result, Scale, Offset); 8731 8732 return Result; 8733 8734#else // _XM_VMX128_INTRINSICS_ 8735#endif // _XM_VMX128_INTRINSICS_ 8736} 8737 8738//------------------------------------------------------------------------------ 8739 8740_Use_decl_annotations_ 8741inline XMFLOAT3* XMVector3ProjectStream 8742( 8743 XMFLOAT3* pOutputStream, 8744 size_t OutputStride, 8745 const XMFLOAT3* pInputStream, 8746 size_t InputStride, 8747 size_t VectorCount, 8748 float ViewportX, 8749 
float ViewportY, 8750 float ViewportWidth, 8751 float ViewportHeight, 8752 float ViewportMinZ, 8753 float ViewportMaxZ, 8754 CXMMATRIX Projection, 8755 CXMMATRIX View, 8756 CXMMATRIX World 8757) 8758{ 8759 assert(pOutputStream != NULL); 8760 assert(pInputStream != NULL); 8761 8762#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 8763 8764 const float HalfViewportWidth = ViewportWidth * 0.5f; 8765 const float HalfViewportHeight = ViewportHeight * 0.5f; 8766 8767 XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); 8768 XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); 8769 8770 XMMATRIX Transform = XMMatrixMultiply(World, View); 8771 Transform = XMMatrixMultiply(Transform, Projection); 8772 8773 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 8774 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 8775 8776 for (size_t i = 0; i < VectorCount; i++) 8777 { 8778 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); 8779 8780 XMVECTOR Result = XMVector3TransformCoord(V, Transform); 8781 Result = XMVectorMultiplyAdd(Result, Scale, Offset); 8782 8783 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); 8784 8785 pInputVector += InputStride; 8786 pOutputVector += OutputStride; 8787 } 8788 8789 return pOutputStream; 8790 8791#else // _XM_VMX128_INTRINSICS_ 8792#endif // _XM_VMX128_INTRINSICS_ 8793} 8794 8795//------------------------------------------------------------------------------ 8796 8797inline XMVECTOR XMVector3Unproject 8798( 8799 FXMVECTOR V, 8800 float ViewportX, 8801 float ViewportY, 8802 float ViewportWidth, 8803 float ViewportHeight, 8804 float ViewportMinZ, 8805 float ViewportMaxZ, 8806 CXMMATRIX Projection, 8807 CXMMATRIX View, 8808 CXMMATRIX World 8809) 8810{ 8811#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || 
defined(_XM_ARM_NEON_INTRINSICS_) 8812 8813 static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; 8814 8815 XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); 8816 Scale = XMVectorReciprocal(Scale); 8817 8818 XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); 8819 Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); 8820 8821 XMMATRIX Transform = XMMatrixMultiply(World, View); 8822 Transform = XMMatrixMultiply(Transform, Projection); 8823 Transform = XMMatrixInverse(NULL, Transform); 8824 8825 XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); 8826 8827 return XMVector3TransformCoord(Result, Transform); 8828 8829#else // _XM_VMX128_INTRINSICS_ 8830#endif // _XM_VMX128_INTRINSICS_ 8831} 8832 8833//------------------------------------------------------------------------------ 8834 8835_Use_decl_annotations_ 8836inline XMFLOAT3* XMVector3UnprojectStream 8837( 8838 XMFLOAT3* pOutputStream, 8839 size_t OutputStride, 8840 const XMFLOAT3* pInputStream, 8841 size_t InputStride, 8842 size_t VectorCount, 8843 float ViewportX, 8844 float ViewportY, 8845 float ViewportWidth, 8846 float ViewportHeight, 8847 float ViewportMinZ, 8848 float ViewportMaxZ, 8849 CXMMATRIX Projection, 8850 CXMMATRIX View, 8851 CXMMATRIX World) 8852{ 8853 assert(pOutputStream != NULL); 8854 assert(pInputStream != NULL); 8855 8856#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_ARM_NEON_INTRINSICS_) 8857 8858 static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; 8859 8860 XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); 8861 Scale = XMVectorReciprocal(Scale); 8862 8863 XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); 8864 Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); 8865 8866 XMMATRIX Transform = XMMatrixMultiply(World, View); 8867 Transform = 
XMMatrixMultiply(Transform, Projection); 8868 Transform = XMMatrixInverse(NULL, Transform); 8869 8870 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 8871 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 8872 8873 for (size_t i = 0; i < VectorCount; i++) 8874 { 8875 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); 8876 8877 XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); 8878 8879 Result = XMVector3TransformCoord(Result, Transform); 8880 8881 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); 8882 8883 pInputVector += InputStride; 8884 pOutputVector += OutputStride; 8885 } 8886 8887 return pOutputStream; 8888 8889#else // _XM_VMX128_INTRINSICS_ 8890#endif // _XM_VMX128_INTRINSICS_ 8891} 8892 8893/**************************************************************************** 8894 * 8895 * 4D Vector 8896 * 8897 ****************************************************************************/ 8898 8899//------------------------------------------------------------------------------ 8900// Comparison operations 8901//------------------------------------------------------------------------------ 8902 8903//------------------------------------------------------------------------------ 8904 8905inline bool XMVector4Equal 8906( 8907 FXMVECTOR V1, 8908 FXMVECTOR V2 8909) 8910{ 8911#if defined(_XM_NO_INTRINSICS_) 8912 return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0); 8913#elif defined(_XM_ARM_NEON_INTRINSICS_) 8914 __n128 vResult = vceqq_f32( V1, V2 ); 8915 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 8916 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 8917 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 8918#elif defined(_XM_SSE_INTRINSICS_) 8919 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 8920 return ((_mm_movemask_ps(vTemp)==0x0f) != 0); 8921#else 8922 return 
XMComparisonAllTrue(XMVector4EqualR(V1, V2)); 8923#endif 8924} 8925 8926//------------------------------------------------------------------------------ 8927 8928inline uint32_t XMVector4EqualR 8929( 8930 FXMVECTOR V1, 8931 FXMVECTOR V2 8932) 8933{ 8934#if defined(_XM_NO_INTRINSICS_) 8935 8936 uint32_t CR = 0; 8937 8938 if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && 8939 (V1.vector4_f32[1] == V2.vector4_f32[1]) && 8940 (V1.vector4_f32[2] == V2.vector4_f32[2]) && 8941 (V1.vector4_f32[3] == V2.vector4_f32[3])) 8942 { 8943 CR = XM_CRMASK_CR6TRUE; 8944 } 8945 else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && 8946 (V1.vector4_f32[1] != V2.vector4_f32[1]) && 8947 (V1.vector4_f32[2] != V2.vector4_f32[2]) && 8948 (V1.vector4_f32[3] != V2.vector4_f32[3])) 8949 { 8950 CR = XM_CRMASK_CR6FALSE; 8951 } 8952 return CR; 8953 8954#elif defined(_XM_ARM_NEON_INTRINSICS_) 8955 __n128 vResult = vceqq_f32( V1, V2 ); 8956 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 8957 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 8958 uint32_t r = vget_lane_u32(vTemp.val[1], 1); 8959 8960 uint32_t CR = 0; 8961 if ( r == 0xFFFFFFFFU ) 8962 { 8963 CR = XM_CRMASK_CR6TRUE; 8964 } 8965 else if ( !r ) 8966 { 8967 CR = XM_CRMASK_CR6FALSE; 8968 } 8969 return CR; 8970#elif defined(_XM_SSE_INTRINSICS_) 8971 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 8972 int iTest = _mm_movemask_ps(vTemp); 8973 uint32_t CR = 0; 8974 if (iTest==0xf) // All equal? 8975 { 8976 CR = XM_CRMASK_CR6TRUE; 8977 } 8978 else if (iTest==0) // All not equal? 
8979 { 8980 CR = XM_CRMASK_CR6FALSE; 8981 } 8982 return CR; 8983#else // _XM_VMX128_INTRINSICS_ 8984#endif // _XM_VMX128_INTRINSICS_ 8985} 8986 8987//------------------------------------------------------------------------------ 8988 8989inline bool XMVector4EqualInt 8990( 8991 FXMVECTOR V1, 8992 FXMVECTOR V2 8993) 8994{ 8995#if defined(_XM_NO_INTRINSICS_) 8996 return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0); 8997#elif defined(_XM_ARM_NEON_INTRINSICS_) 8998 __n128 vResult = vceqq_u32( V1, V2 ); 8999 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9000 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9001 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9002#elif defined(_XM_SSE_INTRINSICS_) 9003 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 9004 return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))==0xf) != 0); 9005#else 9006 return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2)); 9007#endif 9008} 9009 9010//------------------------------------------------------------------------------ 9011 9012inline uint32_t XMVector4EqualIntR 9013( 9014 FXMVECTOR V1, 9015 FXMVECTOR V2 9016) 9017{ 9018#if defined(_XM_NO_INTRINSICS_) 9019 uint32_t CR = 0; 9020 if (V1.vector4_u32[0] == V2.vector4_u32[0] && 9021 V1.vector4_u32[1] == V2.vector4_u32[1] && 9022 V1.vector4_u32[2] == V2.vector4_u32[2] && 9023 V1.vector4_u32[3] == V2.vector4_u32[3]) 9024 { 9025 CR = XM_CRMASK_CR6TRUE; 9026 } 9027 else if (V1.vector4_u32[0] != V2.vector4_u32[0] && 9028 V1.vector4_u32[1] != V2.vector4_u32[1] && 9029 V1.vector4_u32[2] != V2.vector4_u32[2] && 9030 V1.vector4_u32[3] != V2.vector4_u32[3]) 9031 { 9032 CR = XM_CRMASK_CR6FALSE; 9033 } 9034 return CR; 9035 9036#elif defined(_XM_ARM_NEON_INTRINSICS_) 9037 __n128 vResult = vceqq_u32( V1, V2 ); 9038 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), 
vget_high_u8(vResult)); 9039 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9040 uint32_t r = vget_lane_u32(vTemp.val[1], 1); 9041 9042 uint32_t CR = 0; 9043 if ( r == 0xFFFFFFFFU ) 9044 { 9045 CR = XM_CRMASK_CR6TRUE; 9046 } 9047 else if ( !r ) 9048 { 9049 CR = XM_CRMASK_CR6FALSE; 9050 } 9051 return CR; 9052#elif defined(_XM_SSE_INTRINSICS_) 9053 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 9054 int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)); 9055 uint32_t CR = 0; 9056 if (iTest==0xf) // All equal? 9057 { 9058 CR = XM_CRMASK_CR6TRUE; 9059 } 9060 else if (iTest==0) // All not equal? 9061 { 9062 CR = XM_CRMASK_CR6FALSE; 9063 } 9064 return CR; 9065#else // _XM_VMX128_INTRINSICS_ 9066#endif // _XM_VMX128_INTRINSICS_ 9067} 9068 9069inline bool XMVector4NearEqual 9070( 9071 FXMVECTOR V1, 9072 FXMVECTOR V2, 9073 FXMVECTOR Epsilon 9074) 9075{ 9076#if defined(_XM_NO_INTRINSICS_) 9077 float dx, dy, dz, dw; 9078 9079 dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); 9080 dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); 9081 dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); 9082 dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]); 9083 return (((dx <= Epsilon.vector4_f32[0]) && 9084 (dy <= Epsilon.vector4_f32[1]) && 9085 (dz <= Epsilon.vector4_f32[2]) && 9086 (dw <= Epsilon.vector4_f32[3])) != 0); 9087#elif defined(_XM_ARM_NEON_INTRINSICS_) 9088 __n128 vDelta = vsubq_f32( V1, V2 ); 9089 __n128 vResult = vacleq_f32( vDelta, Epsilon ); 9090 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9091 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9092 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9093#elif defined(_XM_SSE_INTRINSICS_) 9094 // Get the difference 9095 XMVECTOR vDelta = _mm_sub_ps(V1,V2); 9096 // Get the absolute value of the difference 9097 XMVECTOR vTemp = _mm_setzero_ps(); 9098 vTemp = _mm_sub_ps(vTemp,vDelta); 9099 vTemp = _mm_max_ps(vTemp,vDelta); 9100 vTemp = _mm_cmple_ps(vTemp,Epsilon); 9101 return 
((_mm_movemask_ps(vTemp)==0xf) != 0); 9102#else // _XM_VMX128_INTRINSICS_ 9103#endif // _XM_VMX128_INTRINSICS_ 9104} 9105 9106//------------------------------------------------------------------------------ 9107 9108inline bool XMVector4NotEqual 9109( 9110 FXMVECTOR V1, 9111 FXMVECTOR V2 9112) 9113{ 9114#if defined(_XM_NO_INTRINSICS_) 9115 return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0); 9116#elif defined(_XM_ARM_NEON_INTRINSICS_) 9117 __n128 vResult = vceqq_f32( V1, V2 ); 9118 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9119 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9120 return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); 9121#elif defined(_XM_SSE_INTRINSICS_) 9122 XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2); 9123 return ((_mm_movemask_ps(vTemp)) != 0); 9124#else 9125 return XMComparisonAnyFalse(XMVector4EqualR(V1, V2)); 9126#endif 9127} 9128 9129//------------------------------------------------------------------------------ 9130 9131inline bool XMVector4NotEqualInt 9132( 9133 FXMVECTOR V1, 9134 FXMVECTOR V2 9135) 9136{ 9137#if defined(_XM_NO_INTRINSICS_) 9138 return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0); 9139#elif defined(_XM_ARM_NEON_INTRINSICS_) 9140 __n128 vResult = vceqq_u32( V1, V2 ); 9141 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9142 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9143 return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); 9144#elif defined(_XM_SSE_INTRINSICS_) 9145 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 9146 return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))!=0xF) != 0); 9147#else 9148 return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2)); 9149#endif 9150} 
9151 9152//------------------------------------------------------------------------------ 9153 9154inline bool XMVector4Greater 9155( 9156 FXMVECTOR V1, 9157 FXMVECTOR V2 9158) 9159{ 9160#if defined(_XM_NO_INTRINSICS_) 9161 return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0); 9162#elif defined(_XM_ARM_NEON_INTRINSICS_) 9163 __n128 vResult = vcgtq_f32( V1, V2 ); 9164 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9165 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9166 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9167#elif defined(_XM_SSE_INTRINSICS_) 9168 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 9169 return ((_mm_movemask_ps(vTemp)==0x0f) != 0); 9170#else 9171 return XMComparisonAllTrue(XMVector4GreaterR(V1, V2)); 9172#endif 9173} 9174 9175//------------------------------------------------------------------------------ 9176 9177inline uint32_t XMVector4GreaterR 9178( 9179 FXMVECTOR V1, 9180 FXMVECTOR V2 9181) 9182{ 9183#if defined(_XM_NO_INTRINSICS_) 9184 uint32_t CR = 0; 9185 if (V1.vector4_f32[0] > V2.vector4_f32[0] && 9186 V1.vector4_f32[1] > V2.vector4_f32[1] && 9187 V1.vector4_f32[2] > V2.vector4_f32[2] && 9188 V1.vector4_f32[3] > V2.vector4_f32[3]) 9189 { 9190 CR = XM_CRMASK_CR6TRUE; 9191 } 9192 else if (V1.vector4_f32[0] <= V2.vector4_f32[0] && 9193 V1.vector4_f32[1] <= V2.vector4_f32[1] && 9194 V1.vector4_f32[2] <= V2.vector4_f32[2] && 9195 V1.vector4_f32[3] <= V2.vector4_f32[3]) 9196 { 9197 CR = XM_CRMASK_CR6FALSE; 9198 } 9199 return CR; 9200 9201#elif defined(_XM_ARM_NEON_INTRINSICS_) 9202 __n128 vResult = vcgtq_f32( V1, V2 ); 9203 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9204 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9205 uint32_t r = vget_lane_u32(vTemp.val[1], 1); 9206 9207 uint32_t CR = 0; 9208 if ( r == 0xFFFFFFFFU ) 9209 { 9210 CR = 
XM_CRMASK_CR6TRUE; 9211 } 9212 else if ( !r ) 9213 { 9214 CR = XM_CRMASK_CR6FALSE; 9215 } 9216 return CR; 9217#elif defined(_XM_SSE_INTRINSICS_) 9218 uint32_t CR = 0; 9219 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 9220 int iTest = _mm_movemask_ps(vTemp); 9221 if (iTest==0xf) { 9222 CR = XM_CRMASK_CR6TRUE; 9223 } 9224 else if (!iTest) 9225 { 9226 CR = XM_CRMASK_CR6FALSE; 9227 } 9228 return CR; 9229#else // _XM_VMX128_INTRINSICS_ 9230#endif // _XM_VMX128_INTRINSICS_ 9231} 9232 9233//------------------------------------------------------------------------------ 9234 9235inline bool XMVector4GreaterOrEqual 9236( 9237 FXMVECTOR V1, 9238 FXMVECTOR V2 9239) 9240{ 9241#if defined(_XM_NO_INTRINSICS_) 9242 return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0); 9243#elif defined(_XM_ARM_NEON_INTRINSICS_) 9244 __n128 vResult = vcgeq_f32( V1, V2 ); 9245 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9246 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9247 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9248#elif defined(_XM_SSE_INTRINSICS_) 9249 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 9250 return ((_mm_movemask_ps(vTemp)==0x0f) != 0); 9251#else 9252 return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2)); 9253#endif 9254} 9255 9256//------------------------------------------------------------------------------ 9257 9258inline uint32_t XMVector4GreaterOrEqualR 9259( 9260 FXMVECTOR V1, 9261 FXMVECTOR V2 9262) 9263{ 9264#if defined(_XM_NO_INTRINSICS_) 9265 uint32_t CR = 0; 9266 if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && 9267 (V1.vector4_f32[1] >= V2.vector4_f32[1]) && 9268 (V1.vector4_f32[2] >= V2.vector4_f32[2]) && 9269 (V1.vector4_f32[3] >= V2.vector4_f32[3])) 9270 { 9271 CR = XM_CRMASK_CR6TRUE; 9272 } 9273 else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && 9274 (V1.vector4_f32[1] < V2.vector4_f32[1]) && 
9275 (V1.vector4_f32[2] < V2.vector4_f32[2]) && 9276 (V1.vector4_f32[3] < V2.vector4_f32[3])) 9277 { 9278 CR = XM_CRMASK_CR6FALSE; 9279 } 9280 return CR; 9281 9282#elif defined(_XM_ARM_NEON_INTRINSICS_) 9283 __n128 vResult = vcgeq_f32( V1, V2 ); 9284 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9285 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9286 uint32_t r = vget_lane_u32(vTemp.val[1], 1); 9287 9288 uint32_t CR = 0; 9289 if ( r == 0xFFFFFFFFU ) 9290 { 9291 CR = XM_CRMASK_CR6TRUE; 9292 } 9293 else if ( !r ) 9294 { 9295 CR = XM_CRMASK_CR6FALSE; 9296 } 9297 return CR; 9298#elif defined(_XM_SSE_INTRINSICS_) 9299 uint32_t CR = 0; 9300 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 9301 int iTest = _mm_movemask_ps(vTemp); 9302 if (iTest==0x0f) 9303 { 9304 CR = XM_CRMASK_CR6TRUE; 9305 } 9306 else if (!iTest) 9307 { 9308 CR = XM_CRMASK_CR6FALSE; 9309 } 9310 return CR; 9311#else // _XM_VMX128_INTRINSICS_ 9312#endif // _XM_VMX128_INTRINSICS_ 9313} 9314 9315//------------------------------------------------------------------------------ 9316 9317inline bool XMVector4Less 9318( 9319 FXMVECTOR V1, 9320 FXMVECTOR V2 9321) 9322{ 9323#if defined(_XM_NO_INTRINSICS_) 9324 return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0); 9325#elif defined(_XM_ARM_NEON_INTRINSICS_) 9326 __n128 vResult = vcltq_f32( V1, V2 ); 9327 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9328 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9329 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9330#elif defined(_XM_SSE_INTRINSICS_) 9331 XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); 9332 return ((_mm_movemask_ps(vTemp)==0x0f) != 0); 9333#else 9334 return XMComparisonAllTrue(XMVector4GreaterR(V2, V1)); 9335#endif 9336} 9337 9338//------------------------------------------------------------------------------ 9339 9340inline bool 
XMVector4LessOrEqual 9341( 9342 FXMVECTOR V1, 9343 FXMVECTOR V2 9344) 9345{ 9346#if defined(_XM_NO_INTRINSICS_) 9347 return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0); 9348#elif defined(_XM_ARM_NEON_INTRINSICS_) 9349 __n128 vResult = vcleq_f32( V1, V2 ); 9350 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9351 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9352 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9353#elif defined(_XM_SSE_INTRINSICS_) 9354 XMVECTOR vTemp = _mm_cmple_ps(V1,V2); 9355 return ((_mm_movemask_ps(vTemp)==0x0f) != 0); 9356#else 9357 return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1)); 9358#endif 9359} 9360 9361//------------------------------------------------------------------------------ 9362 9363inline bool XMVector4InBounds 9364( 9365 FXMVECTOR V, 9366 FXMVECTOR Bounds 9367) 9368{ 9369#if defined(_XM_NO_INTRINSICS_) 9370 return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && 9371 (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && 9372 (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && 9373 (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); 9374#elif defined(_XM_ARM_NEON_INTRINSICS_) 9375 // Test if less than or equal 9376 __n128 vTemp1 = vcleq_f32(V,Bounds); 9377 // Negate the bounds 9378 __n128 vTemp2 = vnegq_f32(Bounds); 9379 // Test if greater or equal (Reversed) 9380 vTemp2 = vcleq_f32(vTemp2,V); 9381 // Blend answers 9382 vTemp1 = vandq_u32(vTemp1,vTemp2); 9383 // in bounds? 
9384 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); 9385 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9386 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9387#elif defined(_XM_SSE_INTRINSICS_) 9388 // Test if less than or equal 9389 XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); 9390 // Negate the bounds 9391 XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); 9392 // Test if greater or equal (Reversed) 9393 vTemp2 = _mm_cmple_ps(vTemp2,V); 9394 // Blend answers 9395 vTemp1 = _mm_and_ps(vTemp1,vTemp2); 9396 // All in bounds? 9397 return ((_mm_movemask_ps(vTemp1)==0x0f) != 0); 9398#else 9399 return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); 9400#endif 9401} 9402 9403 9404//------------------------------------------------------------------------------ 9405 9406inline bool XMVector4IsNaN 9407( 9408 FXMVECTOR V 9409) 9410{ 9411#if defined(_XM_NO_INTRINSICS_) 9412 return (XMISNAN(V.vector4_f32[0]) || 9413 XMISNAN(V.vector4_f32[1]) || 9414 XMISNAN(V.vector4_f32[2]) || 9415 XMISNAN(V.vector4_f32[3])); 9416#elif defined(_XM_ARM_NEON_INTRINSICS_) 9417 // Test against itself. NaN is always not equal 9418 __n128 vTempNan = vceqq_f32( V, V ); 9419 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); 9420 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9421 // If any are NaN, the mask is zero 9422 return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); 9423#elif defined(_XM_SSE_INTRINSICS_) 9424 // Test against itself. 
NaN is always not equal 9425 XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); 9426 // If any are NaN, the mask is non-zero 9427 return (_mm_movemask_ps(vTempNan)!=0); 9428#else // _XM_VMX128_INTRINSICS_ 9429#endif // _XM_VMX128_INTRINSICS_ 9430} 9431 9432//------------------------------------------------------------------------------ 9433 9434inline bool XMVector4IsInfinite 9435( 9436 FXMVECTOR V 9437) 9438{ 9439#if defined(_XM_NO_INTRINSICS_) 9440 9441 return (XMISINF(V.vector4_f32[0]) || 9442 XMISINF(V.vector4_f32[1]) || 9443 XMISINF(V.vector4_f32[2]) || 9444 XMISINF(V.vector4_f32[3])); 9445 9446#elif defined(_XM_ARM_NEON_INTRINSICS_) 9447 // Mask off the sign bit 9448 __n128 vTempInf = vandq_u32( V, g_XMAbsMask ); 9449 // Compare to infinity 9450 vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); 9451 // If any are infinity, the signs are true. 9452 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); 9453 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9454 return ( vget_lane_u32(vTemp.val[1], 1) != 0 ); 9455#elif defined(_XM_SSE_INTRINSICS_) 9456 // Mask off the sign bit 9457 XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask); 9458 // Compare to infinity 9459 vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); 9460 // If any are infinity, the signs are true. 
9461 return (_mm_movemask_ps(vTemp) != 0); 9462#else // _XM_VMX128_INTRINSICS_ 9463#endif // _XM_VMX128_INTRINSICS_ 9464} 9465 9466//------------------------------------------------------------------------------ 9467// Computation operations 9468//------------------------------------------------------------------------------ 9469 9470//------------------------------------------------------------------------------ 9471 9472inline XMVECTOR XMVector4Dot 9473( 9474 FXMVECTOR V1, 9475 FXMVECTOR V2 9476) 9477{ 9478#if defined(_XM_NO_INTRINSICS_) 9479 9480 XMVECTOR Result; 9481 Result.vector4_f32[0] = 9482 Result.vector4_f32[1] = 9483 Result.vector4_f32[2] = 9484 Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; 9485 return Result; 9486 9487#elif defined(_XM_ARM_NEON_INTRINSICS_) 9488 __n128 vTemp = vmulq_f32( V1, V2 ); 9489 __n64 v1 = vget_low_f32( vTemp ); 9490 __n64 v2 = vget_high_f32( vTemp ); 9491 v1 = vpadd_f32( v1, v1 ); 9492 v2 = vpadd_f32( v2, v2 ); 9493 v1 = vadd_f32( v1, v2 ); 9494 return vcombine_f32( v1, v1 ); 9495#elif defined(_XM_SSE_INTRINSICS_) 9496 XMVECTOR vTemp2 = V2; 9497 XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2); 9498 vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position 9499 vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W; 9500 vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position 9501 vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together 9502 return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return 9503#else // _XM_VMX128_INTRINSICS_ 9504#endif // _XM_VMX128_INTRINSICS_ 9505} 9506 9507//------------------------------------------------------------------------------ 9508 9509inline XMVECTOR XMVector4Cross 9510( 9511 FXMVECTOR V1, 9512 FXMVECTOR V2, 9513 FXMVECTOR V3 9514) 9515{ 9516 // [ 
((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w), 9517 // ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w), 9518 // ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w), 9519 // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ] 9520 9521#if defined(_XM_NO_INTRINSICS_) 9522 XMVECTOR Result; 9523 9524 Result.vector4_f32[0] = (((V2.vector4_f32[2]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[2]))*V1.vector4_f32[1])-(((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[2])+(((V2.vector4_f32[1]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[1]))*V1.vector4_f32[3]); 9525 Result.vector4_f32[1] = (((V2.vector4_f32[3]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[3]))*V1.vector4_f32[0])-(((V2.vector4_f32[3]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[3]))*V1.vector4_f32[2])+(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[3]); 9526 Result.vector4_f32[2] = (((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[0])-(((V2.vector4_f32[0]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[0]))*V1.vector4_f32[1])+(((V2.vector4_f32[0]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[0]))*V1.vector4_f32[3]); 9527 Result.vector4_f32[3] = (((V2.vector4_f32[2]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[2]))*V1.vector4_f32[0])-(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[1])+(((V2.vector4_f32[1]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[1]))*V1.vector4_f32[2]); 9528 return Result; 9529 9530#elif defined(_XM_ARM_NEON_INTRINSICS_) 9531 const __n64 select = vget_low_f32( g_XMMaskX ); 9532 9533 // Term1: V2zwyz * V3wzwy 9534 const __n64 v2xy = vget_low_f32(V2); 9535 const __n64 v2zw = vget_high_f32(V2); 9536 const 
__n64 v2yx = vrev64_f32(v2xy); 9537 const __n64 v2wz = vrev64_f32(v2zw); 9538 const __n64 v2yz = vbsl_f32( select, v2yx, v2wz ); 9539 9540 const __n64 v3zw = vget_high_f32(V3); 9541 const __n64 v3wz = vrev64_f32(v3zw); 9542 const __n64 v3xy = vget_low_f32(V3); 9543 const __n64 v3wy = vbsl_f32( select, v3wz, v3xy ); 9544 9545 __n128 vTemp1 = vcombine_f32(v2zw,v2yz); 9546 __n128 vTemp2 = vcombine_f32(v3wz,v3wy); 9547 __n128 vResult = vmulq_f32( vTemp1, vTemp2 ); 9548 9549 // - V2wzwy * V3zwyz 9550 const __n64 v2wy = vbsl_f32( select, v2wz, v2xy ); 9551 9552 const __n64 v3yx = vrev64_f32(v3xy); 9553 const __n64 v3yz = vbsl_f32( select, v3yx, v3wz ); 9554 9555 vTemp1 = vcombine_f32(v2wz,v2wy); 9556 vTemp2 = vcombine_f32(v3zw,v3yz); 9557 vResult = vmlsq_f32( vResult, vTemp1, vTemp2 ); 9558 9559 // term1 * V1yxxx 9560 const __n64 v1xy = vget_low_f32(V1); 9561 const __n64 v1yx = vrev64_f32(v1xy); 9562 9563 vTemp1 = vcombine_f32( v1yx, vdup_lane_f32( v1yx, 1 ) ); 9564 vResult = vmulq_f32( vResult, vTemp1 ); 9565 9566 // Term2: V2ywxz * V3wxwx 9567 const __n64 v2yw = vrev64_f32(v2wy); 9568 const __n64 v2xz = vbsl_f32( select, v2xy, v2wz ); 9569 9570 const __n64 v3wx = vbsl_f32( select, v3wz, v3yx ); 9571 9572 vTemp1 = vcombine_f32(v2yw,v2xz); 9573 vTemp2 = vcombine_f32(v3wx,v3wx); 9574 __n128 vTerm = vmulq_f32( vTemp1, vTemp2 ); 9575 9576 // - V2wxwx * V3ywxz 9577 const __n64 v2wx = vbsl_f32( select, v2wz, v2yx ); 9578 9579 const __n64 v3yw = vrev64_f32(v3wy); 9580 const __n64 v3xz = vbsl_f32( select, v3xy, v3wz ); 9581 9582 vTemp1 = vcombine_f32(v2wx,v2wx); 9583 vTemp2 = vcombine_f32(v3yw,v3xz); 9584 vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); 9585 9586 // vResult - term2 * V1zzyy 9587 const __n64 v1zw = vget_high_f32(V1); 9588 9589 vTemp1 = vcombine_f32( vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0) ); 9590 vResult = vmlsq_f32( vResult, vTerm, vTemp1 ); 9591 9592 // Term3: V2yzxy * V3zxyx 9593 const __n64 v3zx = vrev64_f32(v3xz); 9594 9595 vTemp1 = 
vcombine_f32(v2yz,v2xy); 9596 vTemp2 = vcombine_f32(v3zx,v3yx); 9597 vTerm = vmulq_f32( vTemp1, vTemp2 ); 9598 9599 // - V2zxyx * V3yzxy 9600 const __n64 v2zx = vrev64_f32(v2xz); 9601 9602 vTemp1 = vcombine_f32(v2zx,v2yx); 9603 vTemp2 = vcombine_f32(v3yz,v3xy); 9604 vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); 9605 9606 // vResult + term3 * V1wwwz 9607 const __n64 v1wz = vrev64_f32(v1zw); 9608 9609 vTemp1 = vcombine_f32( vdup_lane_f32( v1wz, 0 ), v1wz ); 9610 return vmlaq_f32( vResult, vTerm, vTemp1 ); 9611#elif defined(_XM_SSE_INTRINSICS_) 9612 // V2zwyz * V3wzwy 9613 XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,1,3,2)); 9614 XMVECTOR vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,3,2,3)); 9615 vResult = _mm_mul_ps(vResult,vTemp3); 9616 // - V2wzwy * V3zwyz 9617 XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,3,2,3)); 9618 vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(1,3,0,1)); 9619 vTemp2 = _mm_mul_ps(vTemp2,vTemp3); 9620 vResult = _mm_sub_ps(vResult,vTemp2); 9621 // term1 * V1yxxx 9622 XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,0,0,1)); 9623 vResult = _mm_mul_ps(vResult,vTemp1); 9624 9625 // V2ywxz * V3wxwx 9626 vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,3,1)); 9627 vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,3,0,3)); 9628 vTemp3 = _mm_mul_ps(vTemp3,vTemp2); 9629 // - V2wxwx * V3ywxz 9630 vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,1,2,1)); 9631 vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(2,0,3,1)); 9632 vTemp2 = _mm_mul_ps(vTemp2,vTemp1); 9633 vTemp3 = _mm_sub_ps(vTemp3,vTemp2); 9634 // vResult - temp * V1zzyy 9635 vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(1,1,2,2)); 9636 vTemp1 = _mm_mul_ps(vTemp1,vTemp3); 9637 vResult = _mm_sub_ps(vResult,vTemp1); 9638 9639 // V2yzxy * V3zxyx 9640 vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,0,2,1)); 9641 vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,1,0,2)); 9642 vTemp3 = _mm_mul_ps(vTemp3,vTemp2); 9643 // - V2zxyx * V3yzxy 9644 vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,0,2,1)); 9645 vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,0,2,1)); 9646 vTemp1 = 
_mm_mul_ps(vTemp1,vTemp2); 9647 vTemp3 = _mm_sub_ps(vTemp3,vTemp1); 9648 // vResult + term * V1wwwz 9649 vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,3,3,3)); 9650 vTemp3 = _mm_mul_ps(vTemp3,vTemp1); 9651 vResult = _mm_add_ps(vResult,vTemp3); 9652 return vResult; 9653#else // _XM_VMX128_INTRINSICS_ 9654#endif // _XM_VMX128_INTRINSICS_ 9655} 9656 9657//------------------------------------------------------------------------------ 9658 9659inline XMVECTOR XMVector4LengthSq 9660( 9661 FXMVECTOR V 9662) 9663{ 9664 return XMVector4Dot(V, V); 9665} 9666 9667//------------------------------------------------------------------------------ 9668 9669inline XMVECTOR XMVector4ReciprocalLengthEst 9670( 9671 FXMVECTOR V 9672) 9673{ 9674#if defined(_XM_NO_INTRINSICS_) 9675 9676 XMVECTOR Result; 9677 9678 Result = XMVector4LengthSq(V); 9679 Result = XMVectorReciprocalSqrtEst(Result); 9680 9681 return Result; 9682 9683#elif defined(_XM_ARM_NEON_INTRINSICS_) 9684 // Dot4 9685 __n128 vTemp = vmulq_f32( V, V ); 9686 __n64 v1 = vget_low_f32( vTemp ); 9687 __n64 v2 = vget_high_f32( vTemp ); 9688 v1 = vpadd_f32( v1, v1 ); 9689 v2 = vpadd_f32( v2, v2 ); 9690 v1 = vadd_f32( v1, v2 ); 9691 // Reciprocal sqrt (estimate) 9692 v2 = vrsqrte_f32( v1 ); 9693 return vcombine_f32(v2, v2); 9694#elif defined(_XM_SSE_INTRINSICS_) 9695 // Perform the dot product on x,y,z and w 9696 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9697 // vTemp has z and w 9698 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9699 // x+z, y+w 9700 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9701 // x+z,x+z,x+z,y+w 9702 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9703 // ??,??,y+w,y+w 9704 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9705 // ??,??,x+z+y+w,?? 
9706 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9707 // Splat the length 9708 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9709 // Get the reciprocal 9710 vLengthSq = _mm_rsqrt_ps(vLengthSq); 9711 return vLengthSq; 9712#else // _XM_VMX128_INTRINSICS_ 9713#endif // _XM_VMX128_INTRINSICS_ 9714} 9715 9716//------------------------------------------------------------------------------ 9717 9718inline XMVECTOR XMVector4ReciprocalLength 9719( 9720 FXMVECTOR V 9721) 9722{ 9723#if defined(_XM_NO_INTRINSICS_) 9724 9725 XMVECTOR Result; 9726 9727 Result = XMVector4LengthSq(V); 9728 Result = XMVectorReciprocalSqrt(Result); 9729 9730 return Result; 9731 9732#elif defined(_XM_ARM_NEON_INTRINSICS_) 9733 // Dot4 9734 __n128 vTemp = vmulq_f32( V, V ); 9735 __n64 v1 = vget_low_f32( vTemp ); 9736 __n64 v2 = vget_high_f32( vTemp ); 9737 v1 = vpadd_f32( v1, v1 ); 9738 v2 = vpadd_f32( v2, v2 ); 9739 v1 = vadd_f32( v1, v2 ); 9740 // Reciprocal sqrt 9741 __n64 S0 = vrsqrte_f32(v1); 9742 __n64 P0 = vmul_f32( v1, S0 ); 9743 __n64 R0 = vrsqrts_f32( P0, S0 ); 9744 __n64 S1 = vmul_f32( S0, R0 ); 9745 __n64 P1 = vmul_f32( v1, S1 ); 9746 __n64 R1 = vrsqrts_f32( P1, S1 ); 9747 __n64 Result = vmul_f32( S1, R1 ); 9748 return vcombine_f32( Result, Result ); 9749#elif defined(_XM_SSE_INTRINSICS_) 9750 // Perform the dot product on x,y,z and w 9751 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9752 // vTemp has z and w 9753 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9754 // x+z, y+w 9755 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9756 // x+z,x+z,x+z,y+w 9757 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9758 // ??,??,y+w,y+w 9759 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9760 // ??,??,x+z+y+w,?? 9761 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9762 // Splat the length 9763 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9764 // Get the reciprocal 9765 vLengthSq = _mm_sqrt_ps(vLengthSq); 9766 // Accurate! 
9767 vLengthSq = _mm_div_ps(g_XMOne,vLengthSq); 9768 return vLengthSq; 9769#else // _XM_VMX128_INTRINSICS_ 9770#endif // _XM_VMX128_INTRINSICS_ 9771} 9772 9773//------------------------------------------------------------------------------ 9774 9775inline XMVECTOR XMVector4LengthEst 9776( 9777 FXMVECTOR V 9778) 9779{ 9780#if defined(_XM_NO_INTRINSICS_) 9781 9782 XMVECTOR Result; 9783 9784 Result = XMVector4LengthSq(V); 9785 Result = XMVectorSqrtEst(Result); 9786 9787 return Result; 9788 9789#elif defined(_XM_ARM_NEON_INTRINSICS_) 9790 // Dot4 9791 __n128 vTemp = vmulq_f32( V, V ); 9792 __n64 v1 = vget_low_f32( vTemp ); 9793 __n64 v2 = vget_high_f32( vTemp ); 9794 v1 = vpadd_f32( v1, v1 ); 9795 v2 = vpadd_f32( v2, v2 ); 9796 v1 = vadd_f32( v1, v2 ); 9797 const __n64 zero = vdup_n_u32(0); 9798 __n64 VEqualsZero = vceq_f32( v1, zero ); 9799 // Sqrt (estimate) 9800 __n64 Result = vrsqrte_f32( v1 ); 9801 Result = vmul_f32( v1, Result ); 9802 Result = vbsl_f32( VEqualsZero, zero, Result ); 9803 return vcombine_f32( Result, Result ); 9804#elif defined(_XM_SSE_INTRINSICS_) 9805 // Perform the dot product on x,y,z and w 9806 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9807 // vTemp has z and w 9808 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9809 // x+z, y+w 9810 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9811 // x+z,x+z,x+z,y+w 9812 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9813 // ??,??,y+w,y+w 9814 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9815 // ??,??,x+z+y+w,?? 
9816 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9817 // Splat the length 9818 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9819 // Prepare for the division 9820 vLengthSq = _mm_sqrt_ps(vLengthSq); 9821 return vLengthSq; 9822#else // _XM_VMX128_INTRINSICS_ 9823#endif // _XM_VMX128_INTRINSICS_ 9824} 9825 9826//------------------------------------------------------------------------------ 9827 9828inline XMVECTOR XMVector4Length 9829( 9830 FXMVECTOR V 9831) 9832{ 9833#if defined(_XM_NO_INTRINSICS_) 9834 9835 XMVECTOR Result; 9836 9837 Result = XMVector4LengthSq(V); 9838 Result = XMVectorSqrt(Result); 9839 9840 return Result; 9841 9842#elif defined(_XM_ARM_NEON_INTRINSICS_) 9843 // Dot4 9844 __n128 vTemp = vmulq_f32( V, V ); 9845 __n64 v1 = vget_low_f32( vTemp ); 9846 __n64 v2 = vget_high_f32( vTemp ); 9847 v1 = vpadd_f32( v1, v1 ); 9848 v2 = vpadd_f32( v2, v2 ); 9849 v1 = vadd_f32( v1, v2 ); 9850 const __n64 zero = vdup_n_u32(0); 9851 __n64 VEqualsZero = vceq_f32( v1, zero ); 9852 // Sqrt 9853 __n64 S0 = vrsqrte_f32( v1 ); 9854 __n64 P0 = vmul_f32( v1, S0 ); 9855 __n64 R0 = vrsqrts_f32( P0, S0 ); 9856 __n64 S1 = vmul_f32( S0, R0 ); 9857 __n64 P1 = vmul_f32( v1, S1 ); 9858 __n64 R1 = vrsqrts_f32( P1, S1 ); 9859 __n64 Result = vmul_f32( S1, R1 ); 9860 Result = vmul_f32( v1, Result ); 9861 Result = vbsl_f32( VEqualsZero, zero, Result ); 9862 return vcombine_f32( Result, Result ); 9863#elif defined(_XM_SSE_INTRINSICS_) 9864 // Perform the dot product on x,y,z and w 9865 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9866 // vTemp has z and w 9867 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9868 // x+z, y+w 9869 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9870 // x+z,x+z,x+z,y+w 9871 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9872 // ??,??,y+w,y+w 9873 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9874 // ??,??,x+z+y+w,?? 
9875 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9876 // Splat the length 9877 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9878 // Prepare for the division 9879 vLengthSq = _mm_sqrt_ps(vLengthSq); 9880 return vLengthSq; 9881#else // _XM_VMX128_INTRINSICS_ 9882#endif // _XM_VMX128_INTRINSICS_ 9883} 9884 9885//------------------------------------------------------------------------------ 9886// XMVector4NormalizeEst uses a reciprocal estimate and 9887// returns QNaN on zero and infinite vectors. 9888 9889inline XMVECTOR XMVector4NormalizeEst 9890( 9891 FXMVECTOR V 9892) 9893{ 9894#if defined(_XM_NO_INTRINSICS_) 9895 9896 XMVECTOR Result; 9897 Result = XMVector4ReciprocalLength(V); 9898 Result = XMVectorMultiply(V, Result); 9899 return Result; 9900 9901#elif defined(_XM_ARM_NEON_INTRINSICS_) 9902 // Dot4 9903 __n128 vTemp = vmulq_f32( V, V ); 9904 __n64 v1 = vget_low_f32( vTemp ); 9905 __n64 v2 = vget_high_f32( vTemp ); 9906 v1 = vpadd_f32( v1, v1 ); 9907 v2 = vpadd_f32( v2, v2 ); 9908 v1 = vadd_f32( v1, v2 ); 9909 // Reciprocal sqrt (estimate) 9910 v2 = vrsqrte_f32( v1 ); 9911 // Normalize 9912 return vmulq_f32( V, vcombine_f32(v2,v2) ); 9913#elif defined(_XM_SSE_INTRINSICS_) 9914 // Perform the dot product on x,y,z and w 9915 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9916 // vTemp has z and w 9917 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9918 // x+z, y+w 9919 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9920 // x+z,x+z,x+z,y+w 9921 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9922 // ??,??,y+w,y+w 9923 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9924 // ??,??,x+z+y+w,?? 
9925 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9926 // Splat the length 9927 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9928 // Get the reciprocal 9929 XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); 9930 // Reciprocal mul to perform the normalization 9931 vResult = _mm_mul_ps(vResult,V); 9932 return vResult; 9933#else // _XM_VMX128_INTRINSICS_ 9934#endif // _XM_VMX128_INTRINSICS_ 9935} 9936 9937//------------------------------------------------------------------------------ 9938 9939inline XMVECTOR XMVector4Normalize 9940( 9941 FXMVECTOR V 9942) 9943{ 9944#if defined(_XM_NO_INTRINSICS_) 9945 float fLength; 9946 XMVECTOR vResult; 9947 9948 vResult = XMVector4Length( V ); 9949 fLength = vResult.vector4_f32[0]; 9950 9951 // Prevent divide by zero 9952 if (fLength > 0) { 9953 fLength = 1.0f/fLength; 9954 } 9955 9956 vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; 9957 vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; 9958 vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; 9959 vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; 9960 return vResult; 9961 9962#elif defined(_XM_ARM_NEON_INTRINSICS_) 9963 // Dot4 9964 __n128 vTemp = vmulq_f32( V, V ); 9965 __n64 v1 = vget_low_f32( vTemp ); 9966 __n64 v2 = vget_high_f32( vTemp ); 9967 v1 = vpadd_f32( v1, v1 ); 9968 v2 = vpadd_f32( v2, v2 ); 9969 v1 = vadd_f32( v1, v2 ); 9970 __n64 VEqualsZero = vceq_f32( v1, vdup_n_u32(0) ); 9971 __n64 VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); 9972 // Reciprocal sqrt (2 iterations of Newton-Raphson) 9973 __n64 S0 = vrsqrte_f32( v1 ); 9974 __n64 P0 = vmul_f32( v1, S0 ); 9975 __n64 R0 = vrsqrts_f32( P0, S0 ); 9976 __n64 S1 = vmul_f32( S0, R0 ); 9977 __n64 P1 = vmul_f32( v1, S1 ); 9978 __n64 R1 = vrsqrts_f32( P1, S1 ); 9979 v2 = vmul_f32( S1, R1 ); 9980 // Normalize 9981 __n128 vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); 9982 vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); 9983 return vbslq_f32( 
vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); 9984#elif defined(_XM_SSE_INTRINSICS_) 9985 // Perform the dot product on x,y,z and w 9986 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9987 // vTemp has z and w 9988 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9989 // x+z, y+w 9990 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9991 // x+z,x+z,x+z,y+w 9992 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9993 // ??,??,y+w,y+w 9994 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9995 // ??,??,x+z+y+w,?? 9996 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9997 // Splat the length 9998 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9999 // Prepare for the division 10000 XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); 10001 // Create zero with a single instruction 10002 XMVECTOR vZeroMask = _mm_setzero_ps(); 10003 // Test for a divide by zero (Must be FP to detect -0.0) 10004 vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); 10005 // Failsafe on zero (Or epsilon) length planes 10006 // If the length is infinity, set the elements to zero 10007 vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); 10008 // Divide to perform the normalization 10009 vResult = _mm_div_ps(V,vResult); 10010 // Any that are infinity, set to zero 10011 vResult = _mm_and_ps(vResult,vZeroMask); 10012 // Select qnan or result based on infinite length 10013 XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); 10014 XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); 10015 vResult = _mm_or_ps(vTemp1,vTemp2); 10016 return vResult; 10017#else // _XM_VMX128_INTRINSICS_ 10018#endif // _XM_VMX128_INTRINSICS_ 10019} 10020 10021//------------------------------------------------------------------------------ 10022 10023inline XMVECTOR XMVector4ClampLength 10024( 10025 FXMVECTOR V, 10026 float LengthMin, 10027 float LengthMax 10028) 10029{ 10030 XMVECTOR ClampMax = XMVectorReplicate(LengthMax); 10031 XMVECTOR ClampMin = XMVectorReplicate(LengthMin); 10032 10033 return 
XMVector4ClampLengthV(V, ClampMin, ClampMax); 10034} 10035 10036//------------------------------------------------------------------------------ 10037 10038inline XMVECTOR XMVector4ClampLengthV 10039( 10040 FXMVECTOR V, 10041 FXMVECTOR LengthMin, 10042 FXMVECTOR LengthMax 10043) 10044{ 10045 assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); 10046 assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); 10047 assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); 10048 assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); 10049 assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); 10050 10051 XMVECTOR LengthSq = XMVector4LengthSq(V); 10052 10053 const XMVECTOR Zero = XMVectorZero(); 10054 10055 XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); 10056 10057 XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); 10058 XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); 10059 10060 XMVECTOR Normal = XMVectorMultiply(V, RcpLength); 10061 10062 XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); 10063 10064 XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); 10065 Length = XMVectorSelect(LengthSq, Length, Select); 10066 Normal = XMVectorSelect(LengthSq, Normal, Select); 10067 10068 XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); 10069 XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); 10070 10071 XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); 10072 ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); 10073 10074 XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); 10075 10076 // Preserve the original vector (with no precision loss) if the length falls within the given range 10077 XMVECTOR Control = XMVectorEqualInt(ControlMax, 
ControlMin); 10078 Result = XMVectorSelect(Result, V, Control); 10079 10080 return Result; 10081} 10082 10083//------------------------------------------------------------------------------ 10084 10085inline XMVECTOR XMVector4Reflect 10086( 10087 FXMVECTOR Incident, 10088 FXMVECTOR Normal 10089) 10090{ 10091 // Result = Incident - (2 * dot(Incident, Normal)) * Normal 10092 10093 XMVECTOR Result = XMVector4Dot(Incident, Normal); 10094 Result = XMVectorAdd(Result, Result); 10095 Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); 10096 10097 return Result; 10098} 10099 10100//------------------------------------------------------------------------------ 10101 10102inline XMVECTOR XMVector4Refract 10103( 10104 FXMVECTOR Incident, 10105 FXMVECTOR Normal, 10106 float RefractionIndex 10107) 10108{ 10109 XMVECTOR Index = XMVectorReplicate(RefractionIndex); 10110 return XMVector4RefractV(Incident, Normal, Index); 10111} 10112 10113//------------------------------------------------------------------------------ 10114 10115inline XMVECTOR XMVector4RefractV 10116( 10117 FXMVECTOR Incident, 10118 FXMVECTOR Normal, 10119 FXMVECTOR RefractionIndex 10120) 10121{ 10122#if defined(_XM_NO_INTRINSICS_) 10123 10124 XMVECTOR IDotN; 10125 XMVECTOR R; 10126 const XMVECTOR Zero = XMVectorZero(); 10127 10128 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + 10129 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) 10130 10131 IDotN = XMVector4Dot(Incident, Normal); 10132 10133 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 10134 R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); 10135 R = XMVectorMultiply(R, RefractionIndex); 10136 R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); 10137 10138 if (XMVector4LessOrEqual(R, Zero)) 10139 { 10140 // Total internal reflection 10141 return Zero; 10142 } 10143 else 10144 { 10145 
XMVECTOR Result; 10146 10147 // R = RefractionIndex * IDotN + sqrt(R) 10148 R = XMVectorSqrt(R); 10149 R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); 10150 10151 // Result = RefractionIndex * Incident - Normal * R 10152 Result = XMVectorMultiply(RefractionIndex, Incident); 10153 Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); 10154 10155 return Result; 10156 } 10157 10158#elif defined(_XM_ARM_NEON_INTRINSICS_) 10159 XMVECTOR IDotN = XMVector4Dot(Incident,Normal); 10160 10161 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 10162 __n128 R = vmlsq_f32( g_XMOne, IDotN, IDotN); 10163 R = vmulq_f32(R, RefractionIndex); 10164 R = vmlsq_f32(g_XMOne, R, RefractionIndex ); 10165 10166 __n128 vResult = vcleq_f32(R,g_XMZero); 10167 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 10168 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 10169 if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) 10170 { 10171 // Total internal reflection 10172 vResult = g_XMZero; 10173 } 10174 else 10175 { 10176 // Sqrt(R) 10177 __n128 S0 = vrsqrteq_f32(R); 10178 __n128 P0 = vmulq_f32( R, S0 ); 10179 __n128 R0 = vrsqrtsq_f32( P0, S0 ); 10180 __n128 S1 = vmulq_f32( S0, R0 ); 10181 __n128 P1 = vmulq_f32( R, S1 ); 10182 __n128 R1 = vrsqrtsq_f32( P1, S1 ); 10183 __n128 S2 = vmulq_f32( S1, R1 ); 10184 R = vmulq_f32( R, S2 ); 10185 // R = RefractionIndex * IDotN + sqrt(R) 10186 R = vmlaq_f32( R, RefractionIndex, IDotN ); 10187 // Result = RefractionIndex * Incident - Normal * R 10188 vResult = vmulq_f32(RefractionIndex, Incident); 10189 vResult = vmlsq_f32( vResult, R, Normal ); 10190 } 10191 return vResult; 10192#elif defined(_XM_SSE_INTRINSICS_) 10193 XMVECTOR IDotN = XMVector4Dot(Incident,Normal); 10194 10195 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 10196 XMVECTOR R = _mm_mul_ps(IDotN,IDotN); 10197 R = _mm_sub_ps(g_XMOne,R); 10198 R = _mm_mul_ps(R, RefractionIndex); 10199 R = _mm_mul_ps(R, 
RefractionIndex); 10200 R = _mm_sub_ps(g_XMOne,R); 10201 10202 XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); 10203 if (_mm_movemask_ps(vResult)==0x0f) 10204 { 10205 // Total internal reflection 10206 vResult = g_XMZero; 10207 } 10208 else 10209 { 10210 // R = RefractionIndex * IDotN + sqrt(R) 10211 R = _mm_sqrt_ps(R); 10212 vResult = _mm_mul_ps(RefractionIndex, IDotN); 10213 R = _mm_add_ps(R,vResult); 10214 // Result = RefractionIndex * Incident - Normal * R 10215 vResult = _mm_mul_ps(RefractionIndex, Incident); 10216 R = _mm_mul_ps(R,Normal); 10217 vResult = _mm_sub_ps(vResult,R); 10218 } 10219 return vResult; 10220#else // _XM_VMX128_INTRINSICS_ 10221#endif // _XM_VMX128_INTRINSICS_ 10222} 10223 10224//------------------------------------------------------------------------------ 10225 10226inline XMVECTOR XMVector4Orthogonal 10227( 10228 FXMVECTOR V 10229) 10230{ 10231#if defined(_XM_NO_INTRINSICS_) 10232 10233 XMVECTOR Result; 10234 Result.vector4_f32[0] = V.vector4_f32[2]; 10235 Result.vector4_f32[1] = V.vector4_f32[3]; 10236 Result.vector4_f32[2] = -V.vector4_f32[0]; 10237 Result.vector4_f32[3] = -V.vector4_f32[1]; 10238 return Result; 10239 10240#elif defined(_XM_ARM_NEON_INTRINSICS_) 10241 static const XMVECTORF32 Negate = { 1.f, 1.f, -1.f, -1.f }; 10242 10243 __n128 Result = vcombine_f32( vget_high_f32( V ), vget_low_f32( V ) ); 10244 return vmulq_f32( Result, Negate ); 10245#elif defined(_XM_SSE_INTRINSICS_) 10246 static const XMVECTORF32 FlipZW = {1.0f,1.0f,-1.0f,-1.0f}; 10247 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,0,3,2)); 10248 vResult = _mm_mul_ps(vResult,FlipZW); 10249 return vResult; 10250#else // _XM_VMX128_INTRINSICS_ 10251#endif // _XM_VMX128_INTRINSICS_ 10252} 10253 10254//------------------------------------------------------------------------------ 10255 10256inline XMVECTOR XMVector4AngleBetweenNormalsEst 10257( 10258 FXMVECTOR N1, 10259 FXMVECTOR N2 10260) 10261{ 10262#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) 
|| defined(_XM_ARM_NEON_INTRINSICS_) 10263 10264 XMVECTOR Result = XMVector4Dot(N1, N2); 10265 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); 10266 Result = XMVectorACosEst(Result); 10267 return Result; 10268 10269#else // _XM_VMX128_INTRINSICS_ 10270#endif // _XM_VMX128_INTRINSICS_ 10271} 10272 10273//------------------------------------------------------------------------------ 10274 10275inline XMVECTOR XMVector4AngleBetweenNormals 10276( 10277 FXMVECTOR N1, 10278 FXMVECTOR N2 10279) 10280{ 10281#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 10282 10283 XMVECTOR Result = XMVector4Dot(N1, N2); 10284 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); 10285 Result = XMVectorACos(Result); 10286 return Result; 10287 10288#else // _XM_VMX128_INTRINSICS_ 10289#endif // _XM_VMX128_INTRINSICS_ 10290} 10291 10292//------------------------------------------------------------------------------ 10293 10294inline XMVECTOR XMVector4AngleBetweenVectors 10295( 10296 FXMVECTOR V1, 10297 FXMVECTOR V2 10298) 10299{ 10300#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 10301 10302 XMVECTOR L1 = XMVector4ReciprocalLength(V1); 10303 XMVECTOR L2 = XMVector4ReciprocalLength(V2); 10304 10305 XMVECTOR Dot = XMVector4Dot(V1, V2); 10306 10307 L1 = XMVectorMultiply(L1, L2); 10308 10309 XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); 10310 CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); 10311 10312 return XMVectorACos(CosAngle); 10313 10314#else // _XM_VMX128_INTRINSICS_ 10315#endif // _XM_VMX128_INTRINSICS_ 10316} 10317 10318//------------------------------------------------------------------------------ 10319 10320inline XMVECTOR XMVector4Transform 10321( 10322 FXMVECTOR V, 10323 CXMMATRIX M 10324) 10325{ 10326#if defined(_XM_NO_INTRINSICS_) 10327 float fX = 
(M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]); 10328 float fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]); 10329 float fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]); 10330 float fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]); 10331 XMVECTOR vResult = { 10332 fX, 10333 fY, 10334 fZ, 10335 fW 10336 }; 10337 return vResult; 10338 10339#elif defined(_XM_ARM_NEON_INTRINSICS_) 10340 __n64 VL = vget_low_f32( V ); 10341 XMVECTOR vTemp1 = vdupq_lane_f32( VL, 0 ); // X 10342 XMVECTOR vTemp2 = vdupq_lane_f32( VL, 1 ); // Y 10343 XMVECTOR vResult = vmulq_f32( vTemp1, M.r[0] ); 10344 vResult = vmlaq_f32( vResult, vTemp2, M.r[1] ); 10345 __n64 VH = vget_high_f32( V ); 10346 vTemp1 = vdupq_lane_f32( VH, 0 ); // Z 10347 vTemp2 = vdupq_lane_f32( VH, 1 ); // W 10348 vResult = vmlaq_f32( vResult, vTemp1, M.r[2] ); 10349 return vmlaq_f32( vResult, vTemp2, M.r[3] ); 10350#elif defined(_XM_SSE_INTRINSICS_) 10351 // Splat x,y,z and w 10352 XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); 10353 XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 10354 XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); 10355 XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); 10356 // Mul by the matrix 10357 vTempX = _mm_mul_ps(vTempX,M.r[0]); 10358 vTempY = _mm_mul_ps(vTempY,M.r[1]); 10359 vTempZ = _mm_mul_ps(vTempZ,M.r[2]); 10360 vTempW = _mm_mul_ps(vTempW,M.r[3]); 10361 // Add them all together 10362 vTempX = _mm_add_ps(vTempX,vTempY); 10363 vTempZ = _mm_add_ps(vTempZ,vTempW); 10364 vTempX = _mm_add_ps(vTempX,vTempZ); 10365 return vTempX; 10366#else // _XM_VMX128_INTRINSICS_ 10367#endif // _XM_VMX128_INTRINSICS_ 10368} 10369 
10370//------------------------------------------------------------------------------ 10371_Use_decl_annotations_ 10372inline XMFLOAT4* XMVector4TransformStream 10373( 10374 XMFLOAT4* pOutputStream, 10375 size_t OutputStride, 10376 const XMFLOAT4* pInputStream, 10377 size_t InputStride, 10378 size_t VectorCount, 10379 CXMMATRIX M 10380) 10381{ 10382 assert(pOutputStream != NULL); 10383 assert(pInputStream != NULL); 10384 10385#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_ARM_NEON_INTRINSICS_) 10386 10387 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 10388 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 10389 10390 const XMVECTOR row0 = M.r[0]; 10391 const XMVECTOR row1 = M.r[1]; 10392 const XMVECTOR row2 = M.r[2]; 10393 const XMVECTOR row3 = M.r[3]; 10394 10395 for (size_t i = 0; i < VectorCount; i++) 10396 { 10397 XMVECTOR V = XMLoadFloat4((const XMFLOAT4*)pInputVector); 10398 XMVECTOR W = XMVectorSplatW(V); 10399 XMVECTOR Z = XMVectorSplatZ(V); 10400 XMVECTOR Y = XMVectorSplatY(V); 10401 XMVECTOR X = XMVectorSplatX(V); 10402 10403 XMVECTOR Result = XMVectorMultiply(W, row3); 10404 Result = XMVectorMultiplyAdd(Z, row2, Result); 10405 Result = XMVectorMultiplyAdd(Y, row1, Result); 10406 Result = XMVectorMultiplyAdd(X, row0, Result); 10407 10408 XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); 10409 10410 pInputVector += InputStride; 10411 pOutputVector += OutputStride; 10412 } 10413 10414 return pOutputStream; 10415 10416#else // _XM_VMX128_INTRINSICS_ 10417#endif // _XM_VMX128_INTRINSICS_ 10418} 10419 10420/**************************************************************************** 10421 * 10422 * XMVECTOR operators 10423 * 10424 ****************************************************************************/ 10425 10426//------------------------------------------------------------------------------ 10427 10428inline XMVECTOR operator+ (FXMVECTOR V) 10429{ 10430 return V; 
10431} 10432 10433//------------------------------------------------------------------------------ 10434 10435inline XMVECTOR operator- (FXMVECTOR V) 10436{ 10437 return XMVectorNegate(V); 10438} 10439 10440//------------------------------------------------------------------------------ 10441 10442inline XMVECTOR& operator+= 10443( 10444 XMVECTOR& V1, 10445 FXMVECTOR V2 10446) 10447{ 10448 V1 = XMVectorAdd(V1, V2); 10449 return V1; 10450} 10451 10452//------------------------------------------------------------------------------ 10453 10454inline XMVECTOR& operator-= 10455( 10456 XMVECTOR& V1, 10457 FXMVECTOR V2 10458) 10459{ 10460 V1 = XMVectorSubtract(V1, V2); 10461 return V1; 10462} 10463 10464//------------------------------------------------------------------------------ 10465 10466inline XMVECTOR& operator*= 10467( 10468 XMVECTOR& V1, 10469 FXMVECTOR V2 10470) 10471{ 10472 V1 = XMVectorMultiply(V1, V2); 10473 return V1; 10474} 10475 10476//------------------------------------------------------------------------------ 10477 10478inline XMVECTOR& operator/= 10479( 10480 XMVECTOR& V1, 10481 FXMVECTOR V2 10482) 10483{ 10484 V1 = XMVectorDivide(V1,V2); 10485 return V1; 10486} 10487 10488//------------------------------------------------------------------------------ 10489 10490inline XMVECTOR& operator*= 10491( 10492 XMVECTOR& V, 10493 const float S 10494) 10495{ 10496 V = XMVectorScale(V, S); 10497 return V; 10498} 10499 10500//------------------------------------------------------------------------------ 10501 10502inline XMVECTOR& operator/= 10503( 10504 XMVECTOR& V, 10505 const float S 10506) 10507{ 10508 assert( S != 0.0f ); 10509 V = XMVectorScale(V, 1.0f / S); 10510 return V; 10511} 10512 10513//------------------------------------------------------------------------------ 10514 10515inline XMVECTOR operator+ 10516( 10517 FXMVECTOR V1, 10518 FXMVECTOR V2 10519) 10520{ 10521 return XMVectorAdd(V1, V2); 10522} 10523 
10524//------------------------------------------------------------------------------ 10525 10526inline XMVECTOR operator- 10527( 10528 FXMVECTOR V1, 10529 FXMVECTOR V2 10530) 10531{ 10532 return XMVectorSubtract(V1, V2); 10533} 10534 10535//------------------------------------------------------------------------------ 10536 10537inline XMVECTOR operator* 10538( 10539 FXMVECTOR V1, 10540 FXMVECTOR V2 10541) 10542{ 10543 return XMVectorMultiply(V1, V2); 10544} 10545 10546//------------------------------------------------------------------------------ 10547 10548inline XMVECTOR operator/ 10549( 10550 FXMVECTOR V1, 10551 FXMVECTOR V2 10552) 10553{ 10554 return XMVectorDivide(V1,V2); 10555} 10556 10557//------------------------------------------------------------------------------ 10558 10559inline XMVECTOR operator* 10560( 10561 FXMVECTOR V, 10562 const float S 10563) 10564{ 10565 return XMVectorScale(V, S); 10566} 10567 10568//------------------------------------------------------------------------------ 10569 10570inline XMVECTOR operator/ 10571( 10572 FXMVECTOR V, 10573 const float S 10574) 10575{ 10576 assert( S != 0.0f ); 10577 return XMVectorScale(V, 1.0f / S); 10578} 10579 10580//------------------------------------------------------------------------------ 10581 10582inline XMVECTOR operator* 10583( 10584 float S, 10585 FXMVECTOR V 10586) 10587{ 10588 return XMVectorScale(V, S); 10589} 10590 10591#if defined(_XM_NO_INTRINSICS_) 10592#undef XMISNAN 10593#undef XMISINF 10594#endif 10595 10596