Minecraft.Client/PS3/PS3Extras/DirectX/DirectXMathMisc.inl at main

ocrm.bsky.social / VoxelBlockGame
fork atom
the game where you go into mines and start crafting! but for consoles (forked directly from smartcmd's github)
fork atom
VoxelBlockGame / Minecraft.Client / PS3 / PS3Extras / DirectX / DirectXMathMisc.inl
at main 2501 lines 75 kB view raw
wrap content
daoge_cmd Initial commit 17d ago
b691c43c
   1//-------------------------------------------------------------------------------------
   2// DirectXMathMisc.inl -- SIMD C++ Math library
   3//
   4// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
   5// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
   6// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
   7// PARTICULAR PURPOSE.
   8//  
   9// Copyright (c) Microsoft Corporation. All rights reserved.
  10//-------------------------------------------------------------------------------------
  11
  12#ifdef _MSC_VER
  13#pragma once
  14#endif
  15
  16/****************************************************************************
  17 *
  18 * Quaternion
  19 *
  20 ****************************************************************************/
  21
  22//------------------------------------------------------------------------------
  23// Comparison operations
  24//------------------------------------------------------------------------------
  25
  26//------------------------------------------------------------------------------
  27
  28inline bool XMQuaternionEqual
  29(
  30    FXMVECTOR Q1,
  31    FXMVECTOR Q2
  32)
  33{
  34    return XMVector4Equal(Q1, Q2);
  35}
  36
  37//------------------------------------------------------------------------------
  38
  39inline bool XMQuaternionNotEqual
  40(
  41    FXMVECTOR Q1,
  42    FXMVECTOR Q2
  43)
  44{
  45    return XMVector4NotEqual(Q1, Q2);
  46}
  47
  48//------------------------------------------------------------------------------
  49
  50inline bool XMQuaternionIsNaN
  51(
  52    FXMVECTOR Q
  53)
  54{
  55    return XMVector4IsNaN(Q);
  56}
  57
  58//------------------------------------------------------------------------------
  59
  60inline bool XMQuaternionIsInfinite
  61(
  62    FXMVECTOR Q
  63)
  64{
  65    return XMVector4IsInfinite(Q);
  66}
  67
  68//------------------------------------------------------------------------------
  69
  70inline bool XMQuaternionIsIdentity
  71(
  72    FXMVECTOR Q
  73)
  74{
  75#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
  76    return XMVector4Equal(Q, g_XMIdentityR3.v);
  77#else // _XM_VMX128_INTRINSICS_
  78#endif // _XM_VMX128_INTRINSICS_
  79}
  80
  81//------------------------------------------------------------------------------
  82// Computation operations
  83//------------------------------------------------------------------------------
  84
  85//------------------------------------------------------------------------------
  86
  87inline XMVECTOR XMQuaternionDot
  88(
  89    FXMVECTOR Q1,
  90    FXMVECTOR Q2
  91)
  92{
  93    return XMVector4Dot(Q1, Q2);
  94}
  95
  96//------------------------------------------------------------------------------
  97
  98inline XMVECTOR XMQuaternionMultiply
  99(
 100    FXMVECTOR Q1,
 101    FXMVECTOR Q2
 102)
 103{
 104    // Returns the product Q2*Q1 (which is the concatenation of a rotation Q1 followed by the rotation Q2)
 105
 106    // [ (Q2.w * Q1.x) + (Q2.x * Q1.w) + (Q2.y * Q1.z) - (Q2.z * Q1.y),
 107    //   (Q2.w * Q1.y) - (Q2.x * Q1.z) + (Q2.y * Q1.w) + (Q2.z * Q1.x),
 108    //   (Q2.w * Q1.z) + (Q2.x * Q1.y) - (Q2.y * Q1.x) + (Q2.z * Q1.w),
 109    //   (Q2.w * Q1.w) - (Q2.x * Q1.x) - (Q2.y * Q1.y) - (Q2.z * Q1.z) ]
 110
 111#if defined(_XM_NO_INTRINSICS_)
 112    XMVECTOR Result = {
 113        (Q2.vector4_f32[3] * Q1.vector4_f32[0]) + (Q2.vector4_f32[0] * Q1.vector4_f32[3]) + (Q2.vector4_f32[1] * Q1.vector4_f32[2]) - (Q2.vector4_f32[2] * Q1.vector4_f32[1]),
 114        (Q2.vector4_f32[3] * Q1.vector4_f32[1]) - (Q2.vector4_f32[0] * Q1.vector4_f32[2]) + (Q2.vector4_f32[1] * Q1.vector4_f32[3]) + (Q2.vector4_f32[2] * Q1.vector4_f32[0]),
 115        (Q2.vector4_f32[3] * Q1.vector4_f32[2]) + (Q2.vector4_f32[0] * Q1.vector4_f32[1]) - (Q2.vector4_f32[1] * Q1.vector4_f32[0]) + (Q2.vector4_f32[2] * Q1.vector4_f32[3]),
 116        (Q2.vector4_f32[3] * Q1.vector4_f32[3]) - (Q2.vector4_f32[0] * Q1.vector4_f32[0]) - (Q2.vector4_f32[1] * Q1.vector4_f32[1]) - (Q2.vector4_f32[2] * Q1.vector4_f32[2]) };
 117    return Result;
 118#elif defined(_XM_ARM_NEON_INTRINSICS_)
 119    static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f};
 120    static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f};
 121    static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f};
 122
 123    __n64 Q2L = vget_low_f32(Q2);
 124    __n64 Q2H = vget_high_f32(Q2);
 125
 126    __n128 Q2X = vdupq_lane_f32( Q2L, 0 );
 127    __n128 Q2Y = vdupq_lane_f32( Q2L, 1 );
 128    __n128 Q2Z = vdupq_lane_f32( Q2H, 0 );
 129    __n128 vResult = vdupq_lane_f32( Q2H, 1 );
 130    vResult = vmulq_f32(vResult,Q1);
 131
 132    // Mul by Q1WZYX
 133    __n128 vTemp = vrev64q_u32(Q1);
 134    vTemp = vcombine_f32( vget_high_f32(vTemp), vget_low_f32(vTemp) );
 135    Q2X = vmulq_f32(Q2X,vTemp);
 136    vResult = vmlaq_f32( vResult, Q2X, ControlWZYX );
 137
 138    // Mul by Q1ZWXY
 139    vTemp = vrev64q_u32(vTemp);
 140    Q2Y = vmulq_f32(Q2Y,vTemp);
 141    vResult = vmlaq_f32(vResult, Q2Y, ControlZWXY);
 142
 143    // Mul by Q1YXWZ
 144    vTemp = vrev64q_u32(vTemp);
 145    vTemp = vcombine_f32(vget_high_f32(vTemp), vget_low_f32(vTemp));
 146    Q2Z = vmulq_f32(Q2Z,vTemp);
 147    vResult = vmlaq_f32(vResult, Q2Z, ControlYXWZ);
 148    return vResult;
 149#elif defined(_XM_SSE_INTRINSICS_)
 150    static const XMVECTORF32 ControlWZYX = { 1.0f,-1.0f, 1.0f,-1.0f};
 151    static const XMVECTORF32 ControlZWXY = { 1.0f, 1.0f,-1.0f,-1.0f};
 152    static const XMVECTORF32 ControlYXWZ = {-1.0f, 1.0f, 1.0f,-1.0f};
 153    // Copy to SSE registers and use as few as possible for x86
 154    XMVECTOR Q2X = Q2;
 155    XMVECTOR Q2Y = Q2;
 156    XMVECTOR Q2Z = Q2;
 157    XMVECTOR vResult = Q2;
 158    // Splat with one instruction
 159    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,3,3,3));
 160    Q2X = XM_PERMUTE_PS(Q2X,_MM_SHUFFLE(0,0,0,0));
 161    Q2Y = XM_PERMUTE_PS(Q2Y,_MM_SHUFFLE(1,1,1,1));
 162    Q2Z = XM_PERMUTE_PS(Q2Z,_MM_SHUFFLE(2,2,2,2));
 163    // Retire Q1 and perform Q1*Q2W
 164    vResult = _mm_mul_ps(vResult,Q1);
 165    XMVECTOR Q1Shuffle = Q1;
 166    // Shuffle the copies of Q1
 167    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
 168    // Mul by Q1WZYX
 169    Q2X = _mm_mul_ps(Q2X,Q1Shuffle);
 170    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(2,3,0,1));
 171    // Flip the signs on y and z
 172    Q2X = _mm_mul_ps(Q2X,ControlWZYX);
 173    // Mul by Q1ZWXY
 174    Q2Y = _mm_mul_ps(Q2Y,Q1Shuffle);
 175    Q1Shuffle = XM_PERMUTE_PS(Q1Shuffle,_MM_SHUFFLE(0,1,2,3));
 176    // Flip the signs on z and w
 177    Q2Y = _mm_mul_ps(Q2Y,ControlZWXY);
 178    // Mul by Q1YXWZ
 179    Q2Z = _mm_mul_ps(Q2Z,Q1Shuffle);
 180    vResult = _mm_add_ps(vResult,Q2X);
 181    // Flip the signs on x and w
 182    Q2Z = _mm_mul_ps(Q2Z,ControlYXWZ);
 183    Q2Y = _mm_add_ps(Q2Y,Q2Z);
 184    vResult = _mm_add_ps(vResult,Q2Y);
 185    return vResult;
 186#else // _XM_VMX128_INTRINSICS_
 187#endif // _XM_VMX128_INTRINSICS_
 188}
 189
 190//------------------------------------------------------------------------------
 191
 192inline XMVECTOR XMQuaternionLengthSq
 193(
 194    FXMVECTOR Q
 195)
 196{
 197    return XMVector4LengthSq(Q);
 198}
 199
 200//------------------------------------------------------------------------------
 201
 202inline XMVECTOR XMQuaternionReciprocalLength
 203(
 204    FXMVECTOR Q
 205)
 206{
 207    return XMVector4ReciprocalLength(Q);
 208}
 209
 210//------------------------------------------------------------------------------
 211
 212inline XMVECTOR XMQuaternionLength
 213(
 214    FXMVECTOR Q
 215)
 216{
 217    return XMVector4Length(Q);
 218}
 219
 220//------------------------------------------------------------------------------
 221
 222inline XMVECTOR XMQuaternionNormalizeEst
 223(
 224    FXMVECTOR Q
 225)
 226{
 227    return XMVector4NormalizeEst(Q);
 228}
 229
 230//------------------------------------------------------------------------------
 231
 232inline XMVECTOR XMQuaternionNormalize
 233(
 234    FXMVECTOR Q
 235)
 236{
 237    return XMVector4Normalize(Q);
 238}
 239
 240//------------------------------------------------------------------------------
 241
 242inline XMVECTOR XMQuaternionConjugate
 243(
 244    FXMVECTOR Q
 245)
 246{
 247#if defined(_XM_NO_INTRINSICS_)
 248    XMVECTOR Result = {
 249        -Q.vector4_f32[0],
 250        -Q.vector4_f32[1],
 251        -Q.vector4_f32[2],
 252        Q.vector4_f32[3]
 253    };
 254    return Result;
 255#elif defined(_XM_ARM_NEON_INTRINSICS_)
 256    static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f};
 257    return vmulq_f32(Q, NegativeOne3.v );
 258#elif defined(_XM_SSE_INTRINSICS_)
 259    static const XMVECTORF32 NegativeOne3 = {-1.0f,-1.0f,-1.0f,1.0f};
 260    return _mm_mul_ps(Q,NegativeOne3);
 261#else // _XM_VMX128_INTRINSICS_
 262#endif // _XM_VMX128_INTRINSICS_
 263}
 264
 265//------------------------------------------------------------------------------
 266
 267inline XMVECTOR XMQuaternionInverse
 268(
 269    FXMVECTOR Q
 270)
 271{
 272#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
 273
 274    const XMVECTOR  Zero = XMVectorZero();
 275
 276    XMVECTOR L = XMVector4LengthSq(Q);
 277    XMVECTOR Conjugate = XMQuaternionConjugate(Q);
 278
 279    XMVECTOR Control = XMVectorLessOrEqual(L, g_XMEpsilon.v);
 280
 281    XMVECTOR Result = XMVectorDivide(Conjugate, L);
 282
 283    Result = XMVectorSelect(Result, Zero, Control);
 284
 285    return Result;
 286
 287#else // _XM_VMX128_INTRINSICS_
 288#endif // _XM_VMX128_INTRINSICS_
 289}
 290
 291//------------------------------------------------------------------------------
 292
 293inline XMVECTOR XMQuaternionLn
 294(
 295    FXMVECTOR Q
 296)
 297{
 298#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
 299
 300    static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
 301
 302    XMVECTOR QW = XMVectorSplatW(Q);
 303    XMVECTOR Q0 = XMVectorSelect(g_XMSelect1110.v, Q, g_XMSelect1110.v);
 304
 305    XMVECTOR ControlW = XMVectorInBounds(QW, OneMinusEpsilon.v);
 306
 307    XMVECTOR Theta = XMVectorACos(QW);
 308    XMVECTOR SinTheta = XMVectorSin(Theta);
 309
 310    XMVECTOR S = XMVectorDivide(Theta,SinTheta);
 311
 312    XMVECTOR Result = XMVectorMultiply(Q0, S);
 313    Result = XMVectorSelect(Q0, Result, ControlW);
 314
 315    return Result;
 316
 317#else // _XM_VMX128_INTRINSICS_
 318#endif // _XM_VMX128_INTRINSICS_
 319}
 320
 321//------------------------------------------------------------------------------
 322
 323inline XMVECTOR XMQuaternionExp
 324(
 325    FXMVECTOR Q
 326)
 327{
 328#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
 329
 330    XMVECTOR Theta = XMVector3Length(Q);
 331
 332    XMVECTOR SinTheta, CosTheta;
 333    XMVectorSinCos(&SinTheta, &CosTheta, Theta);
 334
 335    XMVECTOR S = XMVectorDivide(SinTheta, Theta);
 336
 337    XMVECTOR Result = XMVectorMultiply(Q, S);
 338
 339    const XMVECTOR Zero = XMVectorZero();
 340    XMVECTOR Control = XMVectorNearEqual(Theta, Zero, g_XMEpsilon.v);
 341    Result = XMVectorSelect(Result, Q, Control);
 342
 343    Result = XMVectorSelect(CosTheta, Result, g_XMSelect1110.v);
 344
 345    return Result;
 346
 347#else // _XM_VMX128_INTRINSICS_
 348#endif // _XM_VMX128_INTRINSICS_
 349}
 350
 351//------------------------------------------------------------------------------
 352
 353inline XMVECTOR XMQuaternionSlerp
 354(
 355    FXMVECTOR Q0,
 356    FXMVECTOR Q1,
 357    float    t
 358)
 359{
 360    XMVECTOR T = XMVectorReplicate(t);
 361    return XMQuaternionSlerpV(Q0, Q1, T);
 362}
 363
 364//------------------------------------------------------------------------------
 365
 366inline XMVECTOR XMQuaternionSlerpV
 367(
 368    FXMVECTOR Q0,
 369    FXMVECTOR Q1,
 370    FXMVECTOR T
 371)
 372{
 373    assert((XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)));
 374
 375    // Result = Q0 * sin((1.0 - t) * Omega) / sin(Omega) + Q1 * sin(t * Omega) / sin(Omega)
 376
 377#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
 378
 379    const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
 380
 381    XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1);
 382
 383    const XMVECTOR Zero = XMVectorZero();
 384    XMVECTOR Control = XMVectorLess(CosOmega, Zero);
 385    XMVECTOR Sign = XMVectorSelect(g_XMOne.v, g_XMNegativeOne.v, Control);
 386
 387    CosOmega = XMVectorMultiply(CosOmega, Sign);
 388
 389    Control = XMVectorLess(CosOmega, OneMinusEpsilon);
 390
 391    XMVECTOR SinOmega = XMVectorNegativeMultiplySubtract(CosOmega, CosOmega, g_XMOne.v);
 392    SinOmega = XMVectorSqrt(SinOmega);
 393
 394    XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);
 395
 396    XMVECTOR SignMask = XMVectorSplatSignMask();
 397    XMVECTOR V01 = XMVectorShiftLeft(T, Zero, 2);
 398    SignMask = XMVectorShiftLeft(SignMask, Zero, 3);
 399    V01 = XMVectorXorInt(V01, SignMask);
 400    V01 = XMVectorAdd(g_XMIdentityR0.v, V01);
 401
 402    XMVECTOR InvSinOmega = XMVectorReciprocal(SinOmega);
 403
 404    XMVECTOR S0 = XMVectorMultiply(V01, Omega);
 405    S0 = XMVectorSin(S0);
 406    S0 = XMVectorMultiply(S0, InvSinOmega);
 407
 408    S0 = XMVectorSelect(V01, S0, Control);
 409
 410    XMVECTOR S1 = XMVectorSplatY(S0);
 411    S0 = XMVectorSplatX(S0);
 412
 413    S1 = XMVectorMultiply(S1, Sign);
 414
 415    XMVECTOR Result = XMVectorMultiply(Q0, S0);
 416    Result = XMVectorMultiplyAdd(Q1, S1, Result);
 417
 418    return Result;
 419
 420#elif defined(_XM_SSE_INTRINSICS_)
 421    static const XMVECTORF32 OneMinusEpsilon = {1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f, 1.0f - 0.00001f};
 422    static const XMVECTORI32 SignMask2 = {0x80000000,0x00000000,0x00000000,0x00000000};
 423    static const XMVECTORI32 MaskXY = {0xFFFFFFFF,0xFFFFFFFF,0x00000000,0x00000000};
 424
 425    XMVECTOR CosOmega = XMQuaternionDot(Q0, Q1);
 426
 427    const XMVECTOR Zero = XMVectorZero();
 428    XMVECTOR Control = XMVectorLess(CosOmega, Zero);
 429    XMVECTOR Sign = XMVectorSelect(g_XMOne, g_XMNegativeOne, Control);
 430
 431    CosOmega = _mm_mul_ps(CosOmega, Sign);
 432
 433    Control = XMVectorLess(CosOmega, OneMinusEpsilon);
 434
 435    XMVECTOR SinOmega = _mm_mul_ps(CosOmega,CosOmega);
 436    SinOmega = _mm_sub_ps(g_XMOne,SinOmega);
 437    SinOmega = _mm_sqrt_ps(SinOmega);
 438
 439    XMVECTOR Omega = XMVectorATan2(SinOmega, CosOmega);
 440
 441    XMVECTOR V01 = XM_PERMUTE_PS(T,_MM_SHUFFLE(2,3,0,1));
 442    V01 = _mm_and_ps(V01,MaskXY);
 443    V01 = _mm_xor_ps(V01,SignMask2);
 444    V01 = _mm_add_ps(g_XMIdentityR0, V01);
 445
 446    XMVECTOR S0 = _mm_mul_ps(V01, Omega);
 447    S0 = XMVectorSin(S0);
 448    S0 = _mm_div_ps(S0, SinOmega);
 449
 450    S0 = XMVectorSelect(V01, S0, Control);
 451
 452    XMVECTOR S1 = XMVectorSplatY(S0);
 453    S0 = XMVectorSplatX(S0);
 454
 455    S1 = _mm_mul_ps(S1, Sign);
 456    XMVECTOR Result = _mm_mul_ps(Q0, S0);
 457    S1 = _mm_mul_ps(S1, Q1);
 458    Result = _mm_add_ps(Result,S1);
 459    return Result;
 460#else // _XM_VMX128_INTRINSICS_
 461#endif // _XM_VMX128_INTRINSICS_
 462}
 463
 464//------------------------------------------------------------------------------
 465
 466inline XMVECTOR XMQuaternionSquad
 467(
 468    FXMVECTOR Q0,
 469    FXMVECTOR Q1,
 470    FXMVECTOR Q2,
 471    GXMVECTOR Q3,
 472    float    t
 473)
 474{
 475    XMVECTOR T = XMVectorReplicate(t);
 476    return XMQuaternionSquadV(Q0, Q1, Q2, Q3, T);
 477}
 478
 479//------------------------------------------------------------------------------
 480
 481inline XMVECTOR XMQuaternionSquadV
 482(
 483    FXMVECTOR Q0,
 484    FXMVECTOR Q1,
 485    FXMVECTOR Q2,
 486    GXMVECTOR Q3,
 487    CXMVECTOR T
 488)
 489{
 490    assert( (XMVectorGetY(T) == XMVectorGetX(T)) && (XMVectorGetZ(T) == XMVectorGetX(T)) && (XMVectorGetW(T) == XMVectorGetX(T)) );
 491
 492    XMVECTOR TP = T;
 493    const XMVECTOR Two = XMVectorSplatConstant(2, 0);
 494
 495    XMVECTOR Q03 = XMQuaternionSlerpV(Q0, Q3, T);
 496    XMVECTOR Q12 = XMQuaternionSlerpV(Q1, Q2, T);
 497
 498    TP = XMVectorNegativeMultiplySubtract(TP, TP, TP);
 499    TP = XMVectorMultiply(TP, Two);
 500
 501    XMVECTOR Result = XMQuaternionSlerpV(Q03, Q12, TP);
 502
 503    return Result;
 504}
 505
 506//------------------------------------------------------------------------------
 507_Use_decl_annotations_
 508inline void XMQuaternionSquadSetup
 509(
 510    XMVECTOR* pA,
 511    XMVECTOR* pB,
 512    XMVECTOR* pC,
 513    FXMVECTOR  Q0,
 514    FXMVECTOR  Q1,
 515    FXMVECTOR  Q2,
 516    GXMVECTOR  Q3
 517)
 518{
 519    assert(pA);
 520    assert(pB);
 521    assert(pC);
 522
 523    XMVECTOR LS12 = XMQuaternionLengthSq(XMVectorAdd(Q1, Q2));
 524    XMVECTOR LD12 = XMQuaternionLengthSq(XMVectorSubtract(Q1, Q2));
 525    XMVECTOR SQ2 = XMVectorNegate(Q2);
 526
 527    XMVECTOR Control1 = XMVectorLess(LS12, LD12);
 528    SQ2 = XMVectorSelect(Q2, SQ2, Control1);
 529
 530    XMVECTOR LS01 = XMQuaternionLengthSq(XMVectorAdd(Q0, Q1));
 531    XMVECTOR LD01 = XMQuaternionLengthSq(XMVectorSubtract(Q0, Q1));
 532    XMVECTOR SQ0 = XMVectorNegate(Q0);
 533
 534    XMVECTOR LS23 = XMQuaternionLengthSq(XMVectorAdd(SQ2, Q3));
 535    XMVECTOR LD23 = XMQuaternionLengthSq(XMVectorSubtract(SQ2, Q3));
 536    XMVECTOR SQ3 = XMVectorNegate(Q3);
 537
 538    XMVECTOR Control0 = XMVectorLess(LS01, LD01);
 539    XMVECTOR Control2 = XMVectorLess(LS23, LD23);
 540
 541    SQ0 = XMVectorSelect(Q0, SQ0, Control0);
 542    SQ3 = XMVectorSelect(Q3, SQ3, Control2);
 543
 544    XMVECTOR InvQ1 = XMQuaternionInverse(Q1);
 545    XMVECTOR InvQ2 = XMQuaternionInverse(SQ2);
 546
 547    XMVECTOR LnQ0 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ0));
 548    XMVECTOR LnQ2 = XMQuaternionLn(XMQuaternionMultiply(InvQ1, SQ2));
 549    XMVECTOR LnQ1 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, Q1));
 550    XMVECTOR LnQ3 = XMQuaternionLn(XMQuaternionMultiply(InvQ2, SQ3));
 551
 552    const XMVECTOR NegativeOneQuarter = XMVectorSplatConstant(-1, 2);
 553
 554    XMVECTOR ExpQ02 = XMVectorMultiply(XMVectorAdd(LnQ0, LnQ2), NegativeOneQuarter);
 555    XMVECTOR ExpQ13 = XMVectorMultiply(XMVectorAdd(LnQ1, LnQ3), NegativeOneQuarter);
 556    ExpQ02 = XMQuaternionExp(ExpQ02);
 557    ExpQ13 = XMQuaternionExp(ExpQ13);
 558
 559    *pA = XMQuaternionMultiply(Q1, ExpQ02);
 560    *pB = XMQuaternionMultiply(SQ2, ExpQ13);
 561    *pC = SQ2;
 562}
 563
 564//------------------------------------------------------------------------------
 565
 566inline XMVECTOR XMQuaternionBaryCentric
 567(
 568    FXMVECTOR Q0,
 569    FXMVECTOR Q1,
 570    FXMVECTOR Q2,
 571    float    f,
 572    float    g
 573)
 574{
 575    float s = f + g;
 576
 577    XMVECTOR Result;
 578    if ((s < 0.00001f) && (s > -0.00001f))
 579    {
 580        Result = Q0;
 581    }
 582    else
 583    {
 584        XMVECTOR Q01 = XMQuaternionSlerp(Q0, Q1, s);
 585        XMVECTOR Q02 = XMQuaternionSlerp(Q0, Q2, s);
 586
 587        Result = XMQuaternionSlerp(Q01, Q02, g / s);
 588    }
 589
 590    return Result;
 591}
 592
 593//------------------------------------------------------------------------------
 594
 595inline XMVECTOR XMQuaternionBaryCentricV
 596(
 597    FXMVECTOR Q0,
 598    FXMVECTOR Q1,
 599    FXMVECTOR Q2,
 600    GXMVECTOR F,
 601    CXMVECTOR G
 602)
 603{
 604    assert( (XMVectorGetY(F) == XMVectorGetX(F)) && (XMVectorGetZ(F) == XMVectorGetX(F)) && (XMVectorGetW(F) == XMVectorGetX(F)) );
 605    assert( (XMVectorGetY(G) == XMVectorGetX(G)) && (XMVectorGetZ(G) == XMVectorGetX(G)) && (XMVectorGetW(G) == XMVectorGetX(G)) );
 606
 607    const XMVECTOR Epsilon = XMVectorSplatConstant(1, 16);
 608
 609    XMVECTOR S = XMVectorAdd(F, G);
 610
 611    XMVECTOR Result;
 612    if (XMVector4InBounds(S, Epsilon))
 613    {
 614        Result = Q0;
 615    }
 616    else
 617    {
 618        XMVECTOR Q01 = XMQuaternionSlerpV(Q0, Q1, S);
 619        XMVECTOR Q02 = XMQuaternionSlerpV(Q0, Q2, S);
 620        XMVECTOR GS = XMVectorReciprocal(S);
 621        GS = XMVectorMultiply(G, GS);
 622
 623        Result = XMQuaternionSlerpV(Q01, Q02, GS);
 624    }
 625
 626    return Result;
 627}
 628
 629//------------------------------------------------------------------------------
 630// Transformation operations
 631//------------------------------------------------------------------------------
 632
 633//------------------------------------------------------------------------------
 634
 635inline XMVECTOR XMQuaternionIdentity()
 636{
 637#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
 638    return g_XMIdentityR3.v;
 639#else // _XM_VMX128_INTRINSICS_
 640#endif // _XM_VMX128_INTRINSICS_
 641}
 642
 643//------------------------------------------------------------------------------
 644
 645inline XMVECTOR XMQuaternionRotationRollPitchYaw
 646(
 647    float Pitch,
 648    float Yaw,
 649    float Roll
 650)
 651{
 652    XMVECTOR Angles = XMVectorSet(Pitch, Yaw, Roll, 0.0f);
 653    XMVECTOR Q = XMQuaternionRotationRollPitchYawFromVector(Angles);
 654    return Q;
 655}
 656
 657//------------------------------------------------------------------------------
 658
 659inline XMVECTOR XMQuaternionRotationRollPitchYawFromVector
 660(
 661    FXMVECTOR Angles // <Pitch, Yaw, Roll, 0>
 662)
 663{
 664#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
 665
 666    static const XMVECTORF32  Sign = {1.0f, -1.0f, -1.0f, 1.0f};
 667
 668    XMVECTOR HalfAngles = XMVectorMultiply(Angles, g_XMOneHalf.v);
 669
 670    XMVECTOR SinAngles, CosAngles;
 671    XMVectorSinCos(&SinAngles, &CosAngles, HalfAngles);
 672
 673    XMVECTOR P0 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(SinAngles, CosAngles);
 674    XMVECTOR Y0 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(SinAngles, CosAngles);
 675    XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(SinAngles, CosAngles);
 676    XMVECTOR P1 = XMVectorPermute<XM_PERMUTE_0X, XM_PERMUTE_1X, XM_PERMUTE_1X, XM_PERMUTE_1X>(CosAngles, SinAngles);
 677    XMVECTOR Y1 = XMVectorPermute<XM_PERMUTE_1Y, XM_PERMUTE_0Y, XM_PERMUTE_1Y, XM_PERMUTE_1Y>(CosAngles, SinAngles);
 678    XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1Z, XM_PERMUTE_1Z, XM_PERMUTE_0Z, XM_PERMUTE_1Z>(CosAngles, SinAngles);
 679
 680    XMVECTOR Q1 = XMVectorMultiply(P1, Sign.v);
 681    XMVECTOR Q0 = XMVectorMultiply(P0, Y0);
 682    Q1 = XMVectorMultiply(Q1, Y1);
 683    Q0 = XMVectorMultiply(Q0, R0);
 684    XMVECTOR Q = XMVectorMultiplyAdd(Q1, R1, Q0);
 685
 686    return Q;
 687
 688#else // _XM_VMX128_INTRINSICS_
 689#endif // _XM_VMX128_INTRINSICS_
 690}
 691
 692//------------------------------------------------------------------------------
 693
 694inline XMVECTOR XMQuaternionRotationNormal
 695(
 696    FXMVECTOR NormalAxis,
 697    float    Angle
 698)
 699{
 700#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
 701
 702    XMVECTOR N = XMVectorSelect(g_XMOne.v, NormalAxis, g_XMSelect1110.v);
 703
 704    float SinV, CosV;
 705    XMScalarSinCos(&SinV, &CosV, 0.5f * Angle);
 706
 707    XMVECTOR Scale = XMVectorSet( SinV, SinV, SinV, CosV );
 708    return XMVectorMultiply(N, Scale);
 709#elif defined(_XM_SSE_INTRINSICS_)
 710    XMVECTOR N = _mm_and_ps(NormalAxis,g_XMMask3);
 711    N = _mm_or_ps(N,g_XMIdentityR3);
 712    XMVECTOR Scale = _mm_set_ps1(0.5f * Angle);
 713    XMVECTOR vSine;
 714    XMVECTOR vCosine;
 715    XMVectorSinCos(&vSine,&vCosine,Scale);
 716    Scale = _mm_and_ps(vSine,g_XMMask3);
 717    vCosine = _mm_and_ps(vCosine,g_XMMaskW);
 718    Scale = _mm_or_ps(Scale,vCosine);
 719    N = _mm_mul_ps(N,Scale);
 720    return N;
 721#else // _XM_VMX128_INTRINSICS_
 722#endif // _XM_VMX128_INTRINSICS_
 723}
 724
 725//------------------------------------------------------------------------------
 726
 727inline XMVECTOR XMQuaternionRotationAxis
 728(
 729    FXMVECTOR Axis,
 730    float    Angle
 731)
 732{
 733    assert(!XMVector3Equal(Axis, XMVectorZero()));
 734    assert(!XMVector3IsInfinite(Axis));
 735
 736#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
 737    XMVECTOR Normal = XMVector3Normalize(Axis);
 738    XMVECTOR Q = XMQuaternionRotationNormal(Normal, Angle);
 739    return Q;
 740#else // _XM_VMX128_INTRINSICS_
 741#endif // _XM_VMX128_INTRINSICS_
 742}
 743
 744//------------------------------------------------------------------------------
 745
 746inline XMVECTOR XMQuaternionRotationMatrix
 747(
 748    CXMMATRIX M
 749)
 750{
 751#if defined(_XM_NO_INTRINSICS_)
 752
 753    XMVECTORF32 q;
 754    float r22 = M.m[2][2];
 755    if (r22 <= 0.f)  // x^2 + y^2 >= z^2 + w^2
 756    {
 757        float dif10 = M.m[1][1] - M.m[0][0];
 758        float omr22 = 1.f - r22;
 759        if (dif10 <= 0.f)  // x^2 >= y^2
 760        {
 761            float fourXSqr = omr22 - dif10;
 762            float inv4x = 0.5f / sqrtf(fourXSqr);
 763            q.f[0] = fourXSqr*inv4x;
 764            q.f[1] = (M.m[0][1] + M.m[1][0])*inv4x;
 765            q.f[2] = (M.m[0][2] + M.m[2][0])*inv4x;
 766            q.f[3] = (M.m[1][2] - M.m[2][1])*inv4x;
 767        }
 768        else  // y^2 >= x^2
 769        {
 770            float fourYSqr = omr22 + dif10;
 771            float inv4y = 0.5f / sqrtf(fourYSqr);
 772            q.f[0] = (M.m[0][1] + M.m[1][0])*inv4y;
 773            q.f[1] = fourYSqr*inv4y;
 774            q.f[2] = (M.m[1][2] + M.m[2][1])*inv4y;
 775            q.f[3] = (M.m[2][0] - M.m[0][2])*inv4y;
 776        }
 777    }
 778    else  // z^2 + w^2 >= x^2 + y^2
 779    {
 780        float sum10 = M.m[1][1] + M.m[0][0];
 781        float opr22 = 1.f + r22;
 782        if (sum10 <= 0.f)  // z^2 >= w^2
 783        {
 784            float fourZSqr = opr22 - sum10;
 785            float inv4z = 0.5f / sqrtf(fourZSqr);
 786            q.f[0] = (M.m[0][2] + M.m[2][0])*inv4z;
 787            q.f[1] = (M.m[1][2] + M.m[2][1])*inv4z;
 788            q.f[2] = fourZSqr*inv4z;
 789            q.f[3] = (M.m[0][1] - M.m[1][0])*inv4z;
 790        }
 791        else  // w^2 >= z^2
 792        {
 793            float fourWSqr = opr22 + sum10;
 794            float inv4w = 0.5f / sqrtf(fourWSqr);
 795            q.f[0] = (M.m[1][2] - M.m[2][1])*inv4w;
 796            q.f[1] = (M.m[2][0] - M.m[0][2])*inv4w;
 797            q.f[2] = (M.m[0][1] - M.m[1][0])*inv4w;
 798            q.f[3] = fourWSqr*inv4w;
 799        }
 800    }
 801    return q.v;
 802
 803#elif defined(_XM_ARM_NEON_INTRINSICS_)
 804    static const XMVECTORF32 XMPMMP = {+1.0f, -1.0f, -1.0f, +1.0f};
 805    static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f};
 806    static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f}; 
 807    static const XMVECTORU32 Select0110 = { XM_SELECT_0, XM_SELECT_1, XM_SELECT_1, XM_SELECT_0 };
 808    static const XMVECTORU32 Select0010 = { XM_SELECT_0, XM_SELECT_0, XM_SELECT_1, XM_SELECT_0 };
 809
 810    XMVECTOR r0 = M.r[0];
 811    XMVECTOR r1 = M.r[1];
 812    XMVECTOR r2 = M.r[2];
 813
 814    XMVECTOR r00 = vdupq_lane_f32(vget_low_f32(r0), 0);
 815    XMVECTOR r11 = vdupq_lane_f32(vget_low_f32(r1), 1);
 816    XMVECTOR r22 = vdupq_lane_f32(vget_high_f32(r2), 0);
 817
 818    // x^2 >= y^2 equivalent to r11 - r00 <= 0
 819    XMVECTOR r11mr00 = vsubq_f32(r11, r00);
 820    XMVECTOR x2gey2 = vcleq_f32(r11mr00, g_XMZero);
 821
 822    // z^2 >= w^2 equivalent to r11 + r00 <= 0
 823    XMVECTOR r11pr00 = vaddq_f32(r11, r00);
 824    XMVECTOR z2gew2 = vcleq_f32(r11pr00, g_XMZero);
 825    
 826    // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
 827    XMVECTOR x2py2gez2pw2 = vcleq_f32(r22, g_XMZero);
 828
 829    // (4*x^2, 4*y^2, 4*z^2, 4*w^2)
 830    XMVECTOR t0 = vmulq_f32( XMPMMP, r00 );
 831    XMVECTOR x2y2z2w2 = vmlaq_f32( t0, XMMPMP, r11 );
 832    x2y2z2w2 = vmlaq_f32( x2y2z2w2, XMMMPP, r22 );
 833    x2y2z2w2 = vaddq_f32( x2y2z2w2, g_XMOne );
 834
 835    // (r01, r02, r12, r11)
 836    t0 = vextq_f32(r0, r0, 1);
 837    XMVECTOR t1 = vextq_f32(r1, r1, 1);
 838    t0 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_low_f32( t1 ) ) );
 839
 840    // (r10, r20, r21, r10)
 841    t1 = vextq_f32(r2, r2, 3);
 842    XMVECTOR r10 = vdupq_lane_f32( vget_low_f32(r1), 0 );
 843    t1 = vbslq_f32( Select0110, t1, r10 );
 844
 845    // (4*x*y, 4*x*z, 4*y*z, unused)
 846    XMVECTOR xyxzyz = vaddq_f32(t0, t1);
 847
 848    // (r21, r20, r10, r10)
 849    t0 = vcombine_f32( vrev64_f32( vget_low_f32(r2) ), vget_low_f32(r10) );
 850
 851    // (r12, r02, r01, r12)
 852    XMVECTOR t2 = vcombine_f32( vrev64_f32( vget_high_f32(r0) ), vrev64_f32( vget_low_f32(r0) ) );
 853    XMVECTOR t3 = vdupq_lane_f32( vget_high_f32(r1), 0 );
 854    t1 = vbslq_f32( Select0110, t2, t3 );
 855
 856    // (4*x*w, 4*y*w, 4*z*w, unused)
 857    XMVECTOR xwywzw = vsubq_f32(t0, t1);
 858    xwywzw = vmulq_f32(XMMPMP, xwywzw);
 859
 860    // (4*x*x, 4*x*y, 4*x*z, 4*x*w)
 861    t0 = vextq_f32( xyxzyz, xyxzyz, 3 );
 862    t1 = vbslq_f32( Select0110, t0, x2y2z2w2 );
 863    t2 = vdupq_lane_f32( vget_low_f32(xwywzw), 0 );
 864    XMVECTOR tensor0 = vbslq_f32( g_XMSelect1110, t1, t2 );
 865
 866    // (4*y*x, 4*y*y, 4*y*z, 4*y*w)
 867    t0 = vbslq_f32( g_XMSelect1011, xyxzyz, x2y2z2w2 );
 868    t1 = vdupq_lane_f32( vget_low_f32(xwywzw), 1 );
 869    XMVECTOR tensor1 = vbslq_f32( g_XMSelect1110, t0, t1 );
 870
 871    // (4*z*x, 4*z*y, 4*z*z, 4*z*w)
 872    t0 = vextq_f32(xyxzyz, xyxzyz, 1);
 873    t1 = vcombine_f32( vget_low_f32(t0), vrev64_f32( vget_high_f32(xwywzw) ) );
 874    XMVECTOR tensor2 = vbslq_f32( Select0010, x2y2z2w2, t1 );
 875
 876    // (4*w*x, 4*w*y, 4*w*z, 4*w*w)
 877    XMVECTOR tensor3 = vbslq_f32( g_XMSelect1110, xwywzw, x2y2z2w2 );
 878
 879    // Select the row of the tensor-product matrix that has the largest
 880    // magnitude.
 881    t0 = vbslq_f32( x2gey2, tensor0, tensor1 );
 882    t1 = vbslq_f32( z2gew2, tensor2, tensor3 );
 883    t2 = vbslq_f32( x2py2gez2pw2, t0, t1 );
 884
 885    // Normalize the row.  No division by zero is possible because the
 886    // quaternion is unit-length (and the row is a nonzero multiple of
 887    // the quaternion).
 888    t0 = XMVector4Length(t2);
 889    return XMVectorDivide(t2, t0);
 890#elif defined(_XM_SSE_INTRINSICS_)
 891    static const XMVECTORF32 XMPMMP = {+1.0f, -1.0f, -1.0f, +1.0f};
 892    static const XMVECTORF32 XMMPMP = {-1.0f, +1.0f, -1.0f, +1.0f};
 893    static const XMVECTORF32 XMMMPP = {-1.0f, -1.0f, +1.0f, +1.0f}; 
 894
 895    XMVECTOR r0 = M.r[0];  // (r00, r01, r02, 0)
 896    XMVECTOR r1 = M.r[1];  // (r10, r11, r12, 0)
 897    XMVECTOR r2 = M.r[2];  // (r20, r21, r22, 0)
 898
 899    // (r00, r00, r00, r00)
 900    XMVECTOR r00 = XM_PERMUTE_PS(r0, _MM_SHUFFLE(0,0,0,0));
 901    // (r11, r11, r11, r11)
 902    XMVECTOR r11 = XM_PERMUTE_PS(r1, _MM_SHUFFLE(1,1,1,1));
 903    // (r22, r22, r22, r22)
 904    XMVECTOR r22 = XM_PERMUTE_PS(r2, _MM_SHUFFLE(2,2,2,2));
 905
 906    // x^2 >= y^2 equivalent to r11 - r00 <= 0
 907    // (r11 - r00, r11 - r00, r11 - r00, r11 - r00)
 908    XMVECTOR r11mr00 = _mm_sub_ps(r11, r00);
 909    XMVECTOR x2gey2 = _mm_cmple_ps(r11mr00, g_XMZero);
 910
 911    // z^2 >= w^2 equivalent to r11 + r00 <= 0
 912    // (r11 + r00, r11 + r00, r11 + r00, r11 + r00)
 913    XMVECTOR r11pr00 = _mm_add_ps(r11, r00);
 914    XMVECTOR z2gew2 = _mm_cmple_ps(r11pr00, g_XMZero);
 915
 916    // x^2 + y^2 >= z^2 + w^2 equivalent to r22 <= 0
 917    XMVECTOR x2py2gez2pw2 = _mm_cmple_ps(r22, g_XMZero);
 918
 919    // (+r00, -r00, -r00, +r00)
 920    XMVECTOR t0 = _mm_mul_ps(XMPMMP, r00);
 921
 922    // (-r11, +r11, -r11, +r11)
 923    XMVECTOR t1 = _mm_mul_ps(XMMPMP, r11);
 924
 925    // (-r22, -r22, +r22, +r22)
 926    XMVECTOR t2 = _mm_mul_ps(XMMMPP, r22);
 927
 928    // (4*x^2, 4*y^2, 4*z^2, 4*w^2)
 929    XMVECTOR x2y2z2w2 = _mm_add_ps(t0, t1);
 930    x2y2z2w2 = _mm_add_ps(t2, x2y2z2w2);
 931    x2y2z2w2 = _mm_add_ps(x2y2z2w2, g_XMOne);
 932
 933    // (r01, r02, r12, r11)
 934    t0 = _mm_shuffle_ps(r0, r1, _MM_SHUFFLE(1,2,2,1));
 935    // (r10, r10, r20, r21)
 936    t1 = _mm_shuffle_ps(r1, r2, _MM_SHUFFLE(1,0,0,0));
 937    // (r10, r20, r21, r10)
 938    t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0));
 939    // (4*x*y, 4*x*z, 4*y*z, unused)
 940    XMVECTOR xyxzyz = _mm_add_ps(t0, t1);
 941
 942    // (r21, r20, r10, r10)
 943    t0 = _mm_shuffle_ps(r2, r1, _MM_SHUFFLE(0,0,0,1));
 944    // (r12, r12, r02, r01)
 945    t1 = _mm_shuffle_ps(r1, r0, _MM_SHUFFLE(1,2,2,2));
 946    // (r12, r02, r01, r12)
 947    t1 = XM_PERMUTE_PS(t1, _MM_SHUFFLE(1,3,2,0));
 948    // (4*x*w, 4*y*w, 4*z*w, unused)
 949    XMVECTOR xwywzw = _mm_sub_ps(t0, t1);
 950    xwywzw = _mm_mul_ps(XMMPMP, xwywzw);
 951
 952    // (4*x^2, 4*y^2, 4*x*y, unused)
 953    t0 = _mm_shuffle_ps(x2y2z2w2, xyxzyz, _MM_SHUFFLE(0,0,1,0));
 954    // (4*z^2, 4*w^2, 4*z*w, unused)
 955    t1 = _mm_shuffle_ps(x2y2z2w2, xwywzw, _MM_SHUFFLE(0,2,3,2));
 956    // (4*x*z, 4*y*z, 4*x*w, 4*y*w)
 957    t2 = _mm_shuffle_ps(xyxzyz, xwywzw, _MM_SHUFFLE(1,0,2,1));
 958
 959    // (4*x*x, 4*x*y, 4*x*z, 4*x*w)
 960    XMVECTOR tensor0 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,0,2,0));
 961    // (4*y*x, 4*y*y, 4*y*z, 4*y*w)
 962    XMVECTOR tensor1 = _mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,1,1,2));
 963    // (4*z*x, 4*z*y, 4*z*z, 4*z*w)
 964    XMVECTOR tensor2 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,0,1,0));
 965    // (4*w*x, 4*w*y, 4*w*z, 4*w*w)
 966    XMVECTOR tensor3 = _mm_shuffle_ps(t2, t1, _MM_SHUFFLE(1,2,3,2));
 967
 968    // Select the row of the tensor-product matrix that has the largest
 969    // magnitude.
 970    t0 = _mm_and_ps(x2gey2, tensor0);
 971    t1 = _mm_andnot_ps(x2gey2, tensor1);
 972    t0 = _mm_or_ps(t0, t1);
 973    t1 = _mm_and_ps(z2gew2, tensor2);
 974    t2 = _mm_andnot_ps(z2gew2, tensor3);
 975    t1 = _mm_or_ps(t1, t2);
 976    t0 = _mm_and_ps(x2py2gez2pw2, t0);
 977    t1 = _mm_andnot_ps(x2py2gez2pw2, t1);
 978    t2 = _mm_or_ps(t0, t1);
 979
 980    // Normalize the row.  No division by zero is possible because the
 981    // quaternion is unit-length (and the row is a nonzero multiple of
 982    // the quaternion).
 983    t0 = XMVector4Length(t2);
 984    return _mm_div_ps(t2, t0);
 985#else // _XM_VMX128_INTRINSICS_
 986#endif // _XM_VMX128_INTRINSICS_
 987}
 988
 989//------------------------------------------------------------------------------
 990// Conversion operations
 991//------------------------------------------------------------------------------
 992
 993//------------------------------------------------------------------------------
 994_Use_decl_annotations_
 995inline void XMQuaternionToAxisAngle
 996(
 997    XMVECTOR* pAxis,
 998    float*    pAngle,
 999    FXMVECTOR  Q
1000)
1001{
1002    assert(pAxis);
1003    assert(pAngle);
1004
1005    *pAxis = Q;
1006
1007    *pAngle = 2.0f * XMScalarACos(XMVectorGetW(Q));
1008}
1009
1010/****************************************************************************
1011 *
1012 * Plane
1013 *
1014 ****************************************************************************/
1015
1016//------------------------------------------------------------------------------
1017// Comparison operations
1018//------------------------------------------------------------------------------
1019
1020//------------------------------------------------------------------------------
1021
1022inline bool XMPlaneEqual
1023(
1024    FXMVECTOR P1,
1025    FXMVECTOR P2
1026)
1027{
1028    return XMVector4Equal(P1, P2);
1029}
1030
1031//------------------------------------------------------------------------------
1032
1033inline bool XMPlaneNearEqual
1034(
1035    FXMVECTOR P1,
1036    FXMVECTOR P2,
1037    FXMVECTOR Epsilon
1038)
1039{
1040    XMVECTOR NP1 = XMPlaneNormalize(P1);
1041    XMVECTOR NP2 = XMPlaneNormalize(P2);
1042    return XMVector4NearEqual(NP1, NP2, Epsilon);
1043}
1044
1045//------------------------------------------------------------------------------
1046
1047inline bool XMPlaneNotEqual
1048(
1049    FXMVECTOR P1,
1050    FXMVECTOR P2
1051)
1052{
1053    return XMVector4NotEqual(P1, P2);
1054}
1055
1056//------------------------------------------------------------------------------
1057
1058inline bool XMPlaneIsNaN
1059(
1060    FXMVECTOR P
1061)
1062{
1063    return XMVector4IsNaN(P);
1064}
1065
1066//------------------------------------------------------------------------------
1067
1068inline bool XMPlaneIsInfinite
1069(
1070    FXMVECTOR P
1071)
1072{
1073    return XMVector4IsInfinite(P);
1074}
1075
1076//------------------------------------------------------------------------------
1077// Computation operations
1078//------------------------------------------------------------------------------
1079
1080//------------------------------------------------------------------------------
1081
1082inline XMVECTOR XMPlaneDot
1083(
1084    FXMVECTOR P,
1085    FXMVECTOR V
1086)
1087{
1088    return XMVector4Dot(P, V);
1089}
1090
1091//------------------------------------------------------------------------------
1092
1093inline XMVECTOR XMPlaneDotCoord
1094(
1095    FXMVECTOR P,
1096    FXMVECTOR V
1097)
1098{
1099    // Result = P[0] * V[0] + P[1] * V[1] + P[2] * V[2] + P[3]
1100
1101#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
1102
1103    XMVECTOR V3 = XMVectorSelect(g_XMOne.v, V, g_XMSelect1110.v);
1104    XMVECTOR Result = XMVector4Dot(P, V3);
1105    return Result;
1106
1107#else // _XM_VMX128_INTRINSICS_
1108#endif // _XM_VMX128_INTRINSICS_
1109}
1110
1111//------------------------------------------------------------------------------
1112
1113inline XMVECTOR XMPlaneDotNormal
1114(
1115    FXMVECTOR P,
1116    FXMVECTOR V
1117)
1118{
1119    return XMVector3Dot(P, V);
1120}
1121
1122//------------------------------------------------------------------------------
1123// XMPlaneNormalizeEst uses a reciprocal estimate and
1124// returns QNaN on zero and infinite vectors.
1125
1126inline XMVECTOR XMPlaneNormalizeEst
1127(
1128    FXMVECTOR P
1129)
1130{
1131#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
1132
1133    XMVECTOR Result = XMVector3ReciprocalLengthEst(P);
1134    return XMVectorMultiply(P, Result);
1135
1136#elif defined(_XM_SSE_INTRINSICS_)
1137    // Perform the dot product
1138    XMVECTOR vDot = _mm_mul_ps(P,P);
1139    // x=Dot.y, y=Dot.z
1140    XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
1141    // Result.x = x+y
1142    vDot = _mm_add_ss(vDot,vTemp);
1143    // x=Dot.z
1144    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
1145    // Result.x = (x+y)+z
1146    vDot = _mm_add_ss(vDot,vTemp);
1147    // Splat x
1148    vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
1149    // Get the reciprocal
1150    vDot = _mm_rsqrt_ps(vDot);
1151    // Get the reciprocal
1152    vDot = _mm_mul_ps(vDot,P);
1153    return vDot;
1154#else // _XM_VMX128_INTRINSICS_
1155#endif // _XM_VMX128_INTRINSICS_
1156}
1157
1158//------------------------------------------------------------------------------
1159
1160inline XMVECTOR XMPlaneNormalize
1161(
1162    FXMVECTOR P
1163)
1164{
1165#if defined(_XM_NO_INTRINSICS_)
1166    float fLengthSq = sqrtf((P.vector4_f32[0]*P.vector4_f32[0])+(P.vector4_f32[1]*P.vector4_f32[1])+(P.vector4_f32[2]*P.vector4_f32[2]));
1167    // Prevent divide by zero
1168    if (fLengthSq) {
1169        fLengthSq = 1.0f/fLengthSq;
1170    }
1171    {
1172    XMVECTOR vResult = {
1173        P.vector4_f32[0]*fLengthSq,
1174        P.vector4_f32[1]*fLengthSq,
1175        P.vector4_f32[2]*fLengthSq,
1176        P.vector4_f32[3]*fLengthSq
1177    };
1178    return vResult;
1179    }
1180#elif defined(_XM_ARM_NEON_INTRINSICS_)
1181    XMVECTOR vLength = XMVector3ReciprocalLength(P);
1182    return XMVectorMultiply( P, vLength );
1183#elif defined(_XM_SSE_INTRINSICS_)
1184    // Perform the dot product on x,y and z only
1185    XMVECTOR vLengthSq = _mm_mul_ps(P,P);
1186    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1));
1187    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
1188    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
1189    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
1190    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
1191    // Prepare for the division
1192    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
1193    // Failsafe on zero (Or epsilon) length planes
1194    // If the length is infinity, set the elements to zero
1195    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
1196    // Reciprocal mul to perform the normalization
1197    vResult = _mm_div_ps(P,vResult);
1198    // Any that are infinity, set to zero
1199    vResult = _mm_and_ps(vResult,vLengthSq);
1200    return vResult;
1201#else // _XM_VMX128_INTRINSICS_
1202#endif // _XM_VMX128_INTRINSICS_
1203}
1204
1205//------------------------------------------------------------------------------
1206
1207inline XMVECTOR XMPlaneIntersectLine
1208(
1209    FXMVECTOR P,
1210    FXMVECTOR LinePoint1,
1211    FXMVECTOR LinePoint2
1212)
1213{
1214#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
1215
1216    XMVECTOR V1 = XMVector3Dot(P, LinePoint1);
1217    XMVECTOR V2 = XMVector3Dot(P, LinePoint2);
1218    XMVECTOR D = XMVectorSubtract(V1, V2);
1219
1220    XMVECTOR VT = XMPlaneDotCoord(P, LinePoint1);
1221    VT = XMVectorDivide(VT, D);
1222
1223    XMVECTOR Point = XMVectorSubtract(LinePoint2, LinePoint1);
1224    Point = XMVectorMultiplyAdd(Point, VT, LinePoint1);
1225
1226    const XMVECTOR Zero = XMVectorZero();
1227    XMVECTOR Control = XMVectorNearEqual(D, Zero, g_XMEpsilon.v);
1228
1229    return XMVectorSelect(Point, g_XMQNaN.v, Control);
1230
1231#else // _XM_VMX128_INTRINSICS_
1232#endif // _XM_VMX128_INTRINSICS_
1233}
1234
1235//------------------------------------------------------------------------------
1236_Use_decl_annotations_
1237inline void XMPlaneIntersectPlane
1238(
1239    XMVECTOR* pLinePoint1,
1240    XMVECTOR* pLinePoint2,
1241    FXMVECTOR  P1,
1242    FXMVECTOR  P2
1243)
1244{
1245    assert(pLinePoint1);
1246    assert(pLinePoint2);
1247#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
1248
1249    XMVECTOR V1 = XMVector3Cross(P2, P1);
1250
1251    XMVECTOR LengthSq = XMVector3LengthSq(V1);
1252
1253    XMVECTOR V2 = XMVector3Cross(P2, V1);
1254
1255    XMVECTOR P1W = XMVectorSplatW(P1);
1256    XMVECTOR Point = XMVectorMultiply(V2, P1W);
1257
1258    XMVECTOR V3 = XMVector3Cross(V1, P1);
1259
1260    XMVECTOR P2W = XMVectorSplatW(P2);
1261    Point = XMVectorMultiplyAdd(V3, P2W, Point);
1262
1263    XMVECTOR LinePoint1 = XMVectorDivide(Point, LengthSq);
1264
1265    XMVECTOR LinePoint2 = XMVectorAdd(LinePoint1, V1);
1266
1267    XMVECTOR Control = XMVectorLessOrEqual(LengthSq, g_XMEpsilon.v);
1268    *pLinePoint1 = XMVectorSelect(LinePoint1,g_XMQNaN.v, Control);
1269    *pLinePoint2 = XMVectorSelect(LinePoint2,g_XMQNaN.v, Control);
1270
1271#else // _XM_VMX128_INTRINSICS_
1272#endif // _XM_VMX128_INTRINSICS_
1273}
1274
1275//------------------------------------------------------------------------------
1276
1277inline XMVECTOR XMPlaneTransform
1278(
1279    FXMVECTOR P,
1280    CXMMATRIX M
1281)
1282{
1283#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
1284
1285    XMVECTOR W = XMVectorSplatW(P);
1286    XMVECTOR Z = XMVectorSplatZ(P);
1287    XMVECTOR Y = XMVectorSplatY(P);
1288    XMVECTOR X = XMVectorSplatX(P);
1289
1290    XMVECTOR Result = XMVectorMultiply(W, M.r[3]);
1291    Result = XMVectorMultiplyAdd(Z, M.r[2], Result);
1292    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
1293    Result = XMVectorMultiplyAdd(X, M.r[0], Result);
1294    return Result;
1295
1296#else // _XM_VMX128_INTRINSICS_
1297#endif // _XM_VMX128_INTRINSICS_
1298}
1299
1300//------------------------------------------------------------------------------
1301_Use_decl_annotations_
1302inline XMFLOAT4* XMPlaneTransformStream
1303(
1304    XMFLOAT4*       pOutputStream,
1305    size_t          OutputStride,
1306    const XMFLOAT4* pInputStream,    
1307    size_t          InputStride,
1308    size_t          PlaneCount,
1309    CXMMATRIX       M
1310)
1311{
1312    return XMVector4TransformStream(pOutputStream,
1313                                    OutputStride,
1314                                    pInputStream,
1315                                    InputStride,
1316                                    PlaneCount,
1317                                    M);
1318}
1319
1320//------------------------------------------------------------------------------
1321// Conversion operations
1322//------------------------------------------------------------------------------
1323
1324//------------------------------------------------------------------------------
1325
1326inline XMVECTOR XMPlaneFromPointNormal
1327(
1328    FXMVECTOR Point,
1329    FXMVECTOR Normal
1330)
1331{
1332    XMVECTOR W = XMVector3Dot(Point, Normal);
1333    W = XMVectorNegate(W);
1334    return XMVectorSelect(W, Normal, g_XMSelect1110.v);
1335}
1336
1337//------------------------------------------------------------------------------
1338
1339inline XMVECTOR XMPlaneFromPoints
1340(
1341    FXMVECTOR Point1,
1342    FXMVECTOR Point2,
1343    FXMVECTOR Point3
1344)
1345{
1346#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
1347
1348    XMVECTOR V21 = XMVectorSubtract(Point1, Point2);
1349    XMVECTOR V31 = XMVectorSubtract(Point1, Point3);
1350
1351    XMVECTOR N = XMVector3Cross(V21, V31);
1352    N = XMVector3Normalize(N);
1353
1354    XMVECTOR D = XMPlaneDotNormal(N, Point1);
1355    D = XMVectorNegate(D);
1356
1357    XMVECTOR Result = XMVectorSelect(D, N, g_XMSelect1110.v);
1358
1359    return Result;
1360
1361#else // _XM_VMX128_INTRINSICS_
1362#endif // _XM_VMX128_INTRINSICS_
1363}
1364
1365/****************************************************************************
1366 *
1367 * Color
1368 *
1369 ****************************************************************************/
1370
1371//------------------------------------------------------------------------------
1372// Comparison operations
1373//------------------------------------------------------------------------------
1374
1375//------------------------------------------------------------------------------
1376
1377inline bool XMColorEqual
1378(
1379    FXMVECTOR C1,
1380    FXMVECTOR C2
1381)
1382{
1383    return XMVector4Equal(C1, C2);
1384}
1385
1386//------------------------------------------------------------------------------
1387
1388inline bool XMColorNotEqual
1389(
1390    FXMVECTOR C1,
1391    FXMVECTOR C2
1392)
1393{
1394    return XMVector4NotEqual(C1, C2);
1395}
1396
1397//------------------------------------------------------------------------------
1398
1399inline bool XMColorGreater
1400(
1401    FXMVECTOR C1,
1402    FXMVECTOR C2
1403)
1404{
1405    return XMVector4Greater(C1, C2);
1406}
1407
1408//------------------------------------------------------------------------------
1409
1410inline bool XMColorGreaterOrEqual
1411(
1412    FXMVECTOR C1,
1413    FXMVECTOR C2
1414)
1415{
1416    return XMVector4GreaterOrEqual(C1, C2);
1417}
1418
1419//------------------------------------------------------------------------------
1420
1421inline bool XMColorLess
1422(
1423    FXMVECTOR C1,
1424    FXMVECTOR C2
1425)
1426{
1427    return XMVector4Less(C1, C2);
1428}
1429
1430//------------------------------------------------------------------------------
1431
1432inline bool XMColorLessOrEqual
1433(
1434    FXMVECTOR C1,
1435    FXMVECTOR C2
1436)
1437{
1438    return XMVector4LessOrEqual(C1, C2);
1439}
1440
1441//------------------------------------------------------------------------------
1442
1443inline bool XMColorIsNaN
1444(
1445    FXMVECTOR C
1446)
1447{
1448    return XMVector4IsNaN(C);
1449}
1450
1451//------------------------------------------------------------------------------
1452
1453inline bool XMColorIsInfinite
1454(
1455    FXMVECTOR C
1456)
1457{
1458    return XMVector4IsInfinite(C);
1459}
1460
1461//------------------------------------------------------------------------------
1462// Computation operations
1463//------------------------------------------------------------------------------
1464
1465//------------------------------------------------------------------------------
1466
1467inline XMVECTOR XMColorNegative
1468(
1469    FXMVECTOR vColor
1470)
1471{
1472#if defined(_XM_NO_INTRINSICS_)
1473    XMVECTORF32 vResult = {
1474        1.0f - vColor.vector4_f32[0],
1475        1.0f - vColor.vector4_f32[1],
1476        1.0f - vColor.vector4_f32[2],
1477        vColor.vector4_f32[3]
1478    };
1479    return vResult.v;
1480#elif defined(_XM_ARM_NEON_INTRINSICS_)
1481    XMVECTOR vTemp = veorq_u32(vColor,g_XMNegate3);
1482    return vaddq_f32(vTemp,g_XMOne3);
1483#elif defined(_XM_SSE_INTRINSICS_)
1484    // Negate only x,y and z.
1485    XMVECTOR vTemp = _mm_xor_ps(vColor,g_XMNegate3);
1486    // Add 1,1,1,0 to -x,-y,-z,w
1487    return _mm_add_ps(vTemp,g_XMOne3);
1488#else // _XM_VMX128_INTRINSICS_
1489#endif // _XM_VMX128_INTRINSICS_
1490}
1491
1492//------------------------------------------------------------------------------
1493
1494inline XMVECTOR XMColorModulate
1495(
1496    FXMVECTOR C1,
1497    FXMVECTOR C2
1498)
1499{
1500    return XMVectorMultiply(C1, C2);
1501}
1502
1503//------------------------------------------------------------------------------
1504
1505inline XMVECTOR XMColorAdjustSaturation
1506(
1507    FXMVECTOR vColor,
1508    float    fSaturation
1509)
1510{
1511    // Luminance = 0.2125f * C[0] + 0.7154f * C[1] + 0.0721f * C[2];
1512    // Result = (C - Luminance) * Saturation + Luminance;
1513
1514#if defined(_XM_NO_INTRINSICS_)
1515    const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};
1516
1517    float fLuminance = (vColor.vector4_f32[0]*gvLuminance.f[0])+(vColor.vector4_f32[1]*gvLuminance.f[1])+(vColor.vector4_f32[2]*gvLuminance.f[2]);
1518    XMVECTORF32 vResult = {
1519        ((vColor.vector4_f32[0] - fLuminance)*fSaturation)+fLuminance,
1520        ((vColor.vector4_f32[1] - fLuminance)*fSaturation)+fLuminance,
1521        ((vColor.vector4_f32[2] - fLuminance)*fSaturation)+fLuminance,
1522        vColor.vector4_f32[3]};
1523    return vResult.v;
1524
1525#elif defined(_XM_ARM_NEON_INTRINSICS_)
1526    static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};
1527    XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance );
1528    XMVECTOR vResult = vsubq_f32(vColor, vLuminance);
1529    XMVECTOR vSaturation = vdupq_n_f32(fSaturation);
1530    vResult = vmlaq_f32( vLuminance, vResult, vSaturation );
1531    return vbslq_f32( g_XMSelect1110, vResult, vColor );
1532#elif defined(_XM_SSE_INTRINSICS_)
1533    static const XMVECTORF32 gvLuminance = {0.2125f, 0.7154f, 0.0721f, 0.0f};
1534    XMVECTOR vLuminance = XMVector3Dot( vColor, gvLuminance );
1535// Splat fSaturation
1536    XMVECTOR vSaturation = _mm_set_ps1(fSaturation);
1537// vResult = ((vColor-vLuminance)*vSaturation)+vLuminance;
1538    XMVECTOR vResult = _mm_sub_ps(vColor,vLuminance);
1539    vResult = _mm_mul_ps(vResult,vSaturation);
1540    vResult = _mm_add_ps(vResult,vLuminance);
1541// Retain w from the source color
1542    vLuminance = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2));   // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
1543    vResult = _mm_shuffle_ps(vResult,vLuminance,_MM_SHUFFLE(3,0,1,0));  // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
1544    return vResult;
1545#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1546#endif // _XM_VMX128_INTRINSICS_
1547}
1548
1549//------------------------------------------------------------------------------
1550
1551inline XMVECTOR XMColorAdjustContrast
1552(
1553    FXMVECTOR vColor,
1554    float    fContrast
1555)
1556{
1557    // Result = (vColor - 0.5f) * fContrast + 0.5f;
1558
1559#if defined(_XM_NO_INTRINSICS_)
1560    XMVECTORF32 vResult = {
1561        ((vColor.vector4_f32[0]-0.5f) * fContrast) + 0.5f,
1562        ((vColor.vector4_f32[1]-0.5f) * fContrast) + 0.5f,
1563        ((vColor.vector4_f32[2]-0.5f) * fContrast) + 0.5f,
1564        vColor.vector4_f32[3]        // Leave W untouched
1565    };
1566    return vResult.v;
1567#elif defined(_XM_ARM_NEON_INTRINSICS_)
1568    XMVECTOR vResult = vsubq_f32(vColor, g_XMOneHalf.v);
1569    XMVECTOR vContrast = vdupq_n_f32(fContrast);
1570    vResult = vmlaq_f32( g_XMOneHalf.v, vResult, vContrast );
1571    return vbslq_f32( g_XMSelect1110, vResult, vColor );
1572#elif defined(_XM_SSE_INTRINSICS_)
1573    XMVECTOR vScale = _mm_set_ps1(fContrast);           // Splat the scale
1574    XMVECTOR vResult = _mm_sub_ps(vColor,g_XMOneHalf);  // Subtract 0.5f from the source (Saving source)
1575    vResult = _mm_mul_ps(vResult,vScale);               // Mul by scale
1576    vResult = _mm_add_ps(vResult,g_XMOneHalf);          // Add 0.5f
1577// Retain w from the source color
1578    vScale = _mm_shuffle_ps(vResult,vColor,_MM_SHUFFLE(3,2,2,2));   // x = vResult.z,y = vResult.z,z = vColor.z,w=vColor.w
1579    vResult = _mm_shuffle_ps(vResult,vScale,_MM_SHUFFLE(3,0,1,0));  // x = vResult.x,y = vResult.y,z = vResult.z,w=vColor.w
1580    return vResult;
1581#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1582#endif // _XM_VMX128_INTRINSICS_
1583}
1584
1585//------------------------------------------------------------------------------
1586
1587inline XMVECTOR XMColorRGBToHSL( FXMVECTOR rgb )
1588{
1589    XMVECTOR r = XMVectorSplatX( rgb );
1590    XMVECTOR g = XMVectorSplatY( rgb );
1591    XMVECTOR b = XMVectorSplatZ( rgb );
1592
1593    XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) );
1594    XMVECTOR max = XMVectorMax( r, XMVectorMax( g, b ) );
1595
1596    XMVECTOR l = XMVectorMultiply( XMVectorAdd( min, max ), g_XMOneHalf );
1597
1598    XMVECTOR d = XMVectorSubtract( max, min );
1599
1600    XMVECTOR la = XMVectorSelect( rgb, l, g_XMSelect1110 );
1601
1602    if ( XMVector3Less( d, g_XMEpsilon ) )
1603    {
1604        // Achromatic, assume H and S of 0
1605        return XMVectorSelect( la, g_XMZero, g_XMSelect1100 );
1606    }
1607    else
1608    {
1609        XMVECTOR s, h;
1610
1611        XMVECTOR d2 = XMVectorAdd( min, max );
1612
1613        if ( XMVector3Greater( l, g_XMOneHalf ) )
1614        {
1615            // d / (2-max-min)
1616            s = XMVectorDivide( d, XMVectorSubtract( g_XMTwo, d2 ) ); 
1617        }
1618        else
1619        {
1620            // d / (max+min)
1621            s = XMVectorDivide( d, d2 ); 
1622        }
1623
1624        if ( XMVector3Equal( r, max ) )
1625        {
1626            // Red is max
1627            h = XMVectorDivide( XMVectorSubtract( g, b ), d );
1628        }
1629        else if ( XMVector3Equal( g, max ) )
1630        {
1631            // Green is max
1632            h = XMVectorDivide( XMVectorSubtract( b, r ), d );
1633            h = XMVectorAdd( h, g_XMTwo );
1634        }
1635        else
1636        {
1637            // Blue is max
1638            h = XMVectorDivide( XMVectorSubtract( r, g ), d );
1639            h = XMVectorAdd( h, g_XMFour );
1640        }
1641
1642        h = XMVectorDivide( h, g_XMSix );
1643
1644        if ( XMVector3Less( h, g_XMZero ) )
1645            h = XMVectorAdd( h, g_XMOne );
1646
1647        XMVECTOR lha = XMVectorSelect( la, h, g_XMSelect1100 );
1648        return XMVectorSelect( s, lha, g_XMSelect1011 );
1649    }
1650}
1651
1652//------------------------------------------------------------------------------
1653
1654namespace Internal
1655{
1656
1657inline XMVECTOR XMColorHue2Clr( FXMVECTOR p, FXMVECTOR q, FXMVECTOR h )
1658{
1659    static const XMVECTORF32 oneSixth  = { 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f, 1.0f/6.0f };
1660    static const XMVECTORF32 twoThirds = { 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f };
1661    
1662    XMVECTOR t = h;
1663
1664    if ( XMVector3Less( t, g_XMZero ) )
1665        t = XMVectorAdd( t, g_XMOne );
1666
1667    if ( XMVector3Greater( t, g_XMOne ) )
1668        t = XMVectorSubtract( t, g_XMOne );
1669
1670    if ( XMVector3Less( t, oneSixth ) )
1671    {
1672        // p + (q - p) * 6 * t
1673        XMVECTOR t1 = XMVectorSubtract( q, p );
1674        XMVECTOR t2 = XMVectorMultiply( g_XMSix, t );
1675        return XMVectorMultiplyAdd( t1, t2, p );
1676    }
1677
1678    if ( XMVector3Less( t, g_XMOneHalf ) )
1679        return q;
1680
1681    if ( XMVector3Less( t, twoThirds ) )
1682    {
1683        // p + (q - p) * 6 * (2/3 - t)
1684        XMVECTOR t1 = XMVectorSubtract( q, p );
1685        XMVECTOR t2 = XMVectorMultiply( g_XMSix, XMVectorSubtract( twoThirds, t ) );
1686        return XMVectorMultiplyAdd( t1, t2, p );
1687    }
1688
1689    return p;
1690}
1691
1692}; // namespace Internal
1693
1694inline XMVECTOR XMColorHSLToRGB( FXMVECTOR hsl )
1695{
1696    static const XMVECTORF32 oneThird = { 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f };
1697
1698    XMVECTOR s = XMVectorSplatY( hsl );
1699    XMVECTOR l = XMVectorSplatZ( hsl );
1700
1701    if ( XMVector3NearEqual( s, g_XMZero, g_XMEpsilon ) )
1702    {
1703        // Achromatic
1704        return XMVectorSelect( hsl, l, g_XMSelect1110 );
1705    }
1706    else
1707    {
1708        XMVECTOR h = XMVectorSplatX( hsl );
1709
1710        XMVECTOR q;
1711        if ( XMVector3Less( l, g_XMOneHalf ) )
1712        {
1713            q = XMVectorMultiply( l, XMVectorAdd ( g_XMOne, s ) );
1714        }
1715        else
1716        {
1717            q = XMVectorSubtract( XMVectorAdd( l, s ), XMVectorMultiply( l, s ) );
1718        }
1719
1720        XMVECTOR p = XMVectorSubtract( XMVectorMultiply( g_XMTwo, l ), q );
1721
1722        XMVECTOR r = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorAdd( h, oneThird ) );
1723        XMVECTOR g = DirectX::Internal::XMColorHue2Clr( p, q, h );
1724        XMVECTOR b = DirectX::Internal::XMColorHue2Clr( p, q, XMVectorSubtract( h, oneThird ) );
1725
1726        XMVECTOR rg = XMVectorSelect( g, r, g_XMSelect1000 );
1727        XMVECTOR ba = XMVectorSelect( hsl, b, g_XMSelect1110 );
1728
1729        return XMVectorSelect( ba, rg, g_XMSelect1100 );
1730    }
1731}
1732
1733//------------------------------------------------------------------------------
1734
1735inline XMVECTOR XMColorRGBToHSV( FXMVECTOR rgb )
1736{
1737    XMVECTOR r = XMVectorSplatX( rgb );
1738    XMVECTOR g = XMVectorSplatY( rgb );
1739    XMVECTOR b = XMVectorSplatZ( rgb );
1740
1741    XMVECTOR min = XMVectorMin( r, XMVectorMin( g, b ) );
1742    XMVECTOR v = XMVectorMax( r, XMVectorMax( g, b ) );
1743
1744    XMVECTOR d = XMVectorSubtract( v, min );
1745
1746    XMVECTOR s = ( XMVector3NearEqual( v, g_XMZero, g_XMEpsilon ) ) ? g_XMZero : XMVectorDivide( d, v );
1747
1748    if ( XMVector3Less( d, g_XMEpsilon ) )
1749    {
1750        // Achromatic, assume H of 0
1751        XMVECTOR hv = XMVectorSelect( v, g_XMZero, g_XMSelect1000 );
1752        XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
1753        return XMVectorSelect( s, hva, g_XMSelect1011 );
1754    }
1755    else
1756    {
1757        XMVECTOR h;
1758
1759        if ( XMVector3Equal( r, v ) )
1760        {
1761            // Red is max
1762            h = XMVectorDivide( XMVectorSubtract( g, b ), d );
1763
1764            if ( XMVector3Less( g, b ) )
1765                h = XMVectorAdd( h, g_XMSix );
1766        }
1767        else if ( XMVector3Equal( g, v ) )
1768        {
1769            // Green is max
1770            h = XMVectorDivide( XMVectorSubtract( b, r ), d );
1771            h = XMVectorAdd( h, g_XMTwo );
1772        }
1773        else
1774        {
1775            // Blue is max
1776            h = XMVectorDivide( XMVectorSubtract( r, g ), d );
1777            h = XMVectorAdd( h, g_XMFour );
1778        }
1779
1780        h = XMVectorDivide( h, g_XMSix );
1781
1782        XMVECTOR hv = XMVectorSelect( v, h, g_XMSelect1000 );
1783        XMVECTOR hva = XMVectorSelect( rgb, hv, g_XMSelect1110 );
1784        return XMVectorSelect( s, hva, g_XMSelect1011 );
1785    }
1786}
1787
1788//------------------------------------------------------------------------------
1789
1790inline XMVECTOR XMColorHSVToRGB( FXMVECTOR hsv )
1791{
1792    XMVECTOR h = XMVectorSplatX( hsv );
1793    XMVECTOR s = XMVectorSplatY( hsv );
1794    XMVECTOR v = XMVectorSplatZ( hsv );
1795
1796    XMVECTOR h6 = XMVectorMultiply( h, g_XMSix );
1797
1798    XMVECTOR i = XMVectorFloor( h6 );
1799    XMVECTOR f = XMVectorSubtract( h6, i );
1800
1801    // p = v* (1-s)
1802    XMVECTOR p = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, s ) );
1803
1804    // q = v*(1-f*s)
1805    XMVECTOR q = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( f, s ) ) );
1806
1807    // t = v*(1 - (1-f)*s)
1808    XMVECTOR t = XMVectorMultiply( v, XMVectorSubtract( g_XMOne, XMVectorMultiply( XMVectorSubtract( g_XMOne, f ), s ) ) );
1809
1810    int ii = static_cast<int>( XMVectorGetX( XMVectorMod( i, g_XMSix ) ) );
1811
1812    XMVECTOR _rgb;
1813
1814    switch (ii)
1815    {
1816    case 0: // rgb = vtp
1817        {
1818            XMVECTOR vt = XMVectorSelect( t, v, g_XMSelect1000 );
1819            _rgb = XMVectorSelect( p, vt, g_XMSelect1100 );
1820        }
1821        break;
1822    case 1: // rgb = qvp
1823        {
1824            XMVECTOR qv = XMVectorSelect( v, q, g_XMSelect1000 );
1825            _rgb = XMVectorSelect( p, qv, g_XMSelect1100 );
1826        }
1827        break;
1828    case 2: // rgb = pvt
1829        {
1830            XMVECTOR pv = XMVectorSelect( v, p, g_XMSelect1000 );
1831            _rgb = XMVectorSelect( t, pv, g_XMSelect1100 );
1832        }
1833        break;
1834    case 3: // rgb = pqv
1835        {
1836            XMVECTOR pq = XMVectorSelect( q, p, g_XMSelect1000 );
1837            _rgb = XMVectorSelect( v, pq, g_XMSelect1100 );
1838        }
1839        break;
1840    case 4: // rgb = tpv
1841        {
1842            XMVECTOR tp = XMVectorSelect( p, t, g_XMSelect1000 );
1843            _rgb = XMVectorSelect( v, tp, g_XMSelect1100 );
1844        }
1845        break;
1846    default: // rgb = vpq
1847        {
1848            XMVECTOR vp = XMVectorSelect( p, v, g_XMSelect1000 );
1849            _rgb = XMVectorSelect( q, vp, g_XMSelect1100 );
1850        }
1851        break;
1852    }
1853
1854    return XMVectorSelect( hsv, _rgb, g_XMSelect1110 );
1855}
1856
1857//------------------------------------------------------------------------------
1858
1859inline XMVECTOR XMColorRGBToYUV( FXMVECTOR rgb )
1860{
1861    static const XMVECTORF32 Scale0 = {  0.299f, -0.147f,  0.615f, 0.0f }; 
1862    static const XMVECTORF32 Scale1 = {  0.587f, -0.289f, -0.515f, 0.0f };
1863    static const XMVECTORF32 Scale2 = {  0.114f,  0.436f, -0.100f, 0.0f };
1864
1865    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
1866    XMVECTOR clr = XMVector3Transform( rgb, M );
1867
1868    return XMVectorSelect( rgb, clr, g_XMSelect1110 );
1869}
1870
1871//------------------------------------------------------------------------------
1872
1873inline XMVECTOR XMColorYUVToRGB( FXMVECTOR yuv )
1874{
1875    static const XMVECTORF32 Scale1 = {   0.0f, -0.395f, 2.032f, 0.0f };
1876    static const XMVECTORF32 Scale2 = { 1.140f, -0.581f,   0.0f, 0.0f };
1877
1878    XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero );
1879    XMVECTOR clr = XMVector3Transform( yuv, M );
1880
1881    return XMVectorSelect( yuv, clr, g_XMSelect1110 );
1882}
1883
1884//------------------------------------------------------------------------------
1885
1886inline XMVECTOR XMColorRGBToYUV_HD( FXMVECTOR rgb )
1887{
1888    static const XMVECTORF32 Scale0 = { 0.2126f, -0.0997f,  0.6150f, 0.0f };
1889    static const XMVECTORF32 Scale1 = { 0.7152f, -0.3354f, -0.5586f, 0.0f };
1890    static const XMVECTORF32 Scale2 = { 0.0722f,  0.4351f, -0.0564f, 0.0f };
1891
1892    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
1893    XMVECTOR clr = XMVector3Transform( rgb, M );
1894
1895    return XMVectorSelect( rgb, clr, g_XMSelect1110 );
1896}
1897
1898//------------------------------------------------------------------------------
1899
1900inline XMVECTOR XMColorYUVToRGB_HD( FXMVECTOR yuv )
1901{
1902    static const XMVECTORF32 Scale1 = {    0.0f, -0.2153f, 2.1324f, 0.0f };
1903    static const XMVECTORF32 Scale2 = { 1.2803f, -0.3806f,    0.0f, 0.0f };
1904        
1905    XMMATRIX M( g_XMOne, Scale1, Scale2, g_XMZero );
1906    XMVECTOR clr = XMVector3Transform( yuv, M );
1907
1908    return XMVectorSelect( yuv, clr, g_XMSelect1110 );
1909}
1910
1911//------------------------------------------------------------------------------
1912
1913inline XMVECTOR XMColorRGBToXYZ( FXMVECTOR rgb )
1914{
1915    static const XMVECTORF32 Scale0 = { 0.4887180f, 0.1762044f, 0.0000000f, 0.0f };
1916    static const XMVECTORF32 Scale1 = { 0.3106803f, 0.8129847f, 0.0102048f, 0.0f };
1917    static const XMVECTORF32 Scale2 = { 0.2006017f, 0.0108109f, 0.9897952f, 0.0f };
1918    static const XMVECTORF32 Scale = { 1.f/0.17697f, 1.f/0.17697f, 1.f/0.17697f, 0.0f };
1919
1920    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
1921    XMVECTOR clr = XMVectorMultiply( XMVector3Transform( rgb, M ), Scale );
1922
1923    return XMVectorSelect( rgb, clr, g_XMSelect1110 );
1924}
1925
1926inline XMVECTOR XMColorXYZToRGB( FXMVECTOR xyz )
1927{
1928    static const XMVECTORF32 Scale0 = {  2.3706743f, -0.5138850f,  0.0052982f, 0.0f };
1929    static const XMVECTORF32 Scale1 = { -0.9000405f,  1.4253036f, -0.0146949f, 0.0f };
1930    static const XMVECTORF32 Scale2 = { -0.4706338f,  0.0885814f,  1.0093968f, 0.0f };
1931    static const XMVECTORF32 Scale = { 0.17697f, 0.17697f, 0.17697f, 0.0f };
1932
1933    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
1934    XMVECTOR clr = XMVector3Transform( XMVectorMultiply( xyz, Scale ), M );
1935
1936    return XMVectorSelect( xyz, clr, g_XMSelect1110 );
1937}
1938
1939//------------------------------------------------------------------------------
1940
1941inline XMVECTOR XMColorXYZToSRGB( FXMVECTOR xyz )
1942{
1943    static const XMVECTORF32 Scale0 = {  3.2406f, -0.9689f,  0.0557f, 0.0f };
1944    static const XMVECTORF32 Scale1 = { -1.5372f,  1.8758f, -0.2040f, 0.0f };
1945    static const XMVECTORF32 Scale2 = { -0.4986f,  0.0415f,  1.0570f, 0.0f };
1946    static const XMVECTORF32 Cutoff = { 0.0031308f, 0.0031308f, 0.0031308f, 0.0f };
1947    static const XMVECTORF32 Exp    = { 1.0f/2.4f, 1.0f/2.4f, 1.0f/2.4f, 1.0f };
1948
1949    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
1950    XMVECTOR lclr = XMVector3Transform( xyz, M );
1951
1952    XMVECTOR sel = XMVectorGreater( lclr, Cutoff );
1953
1954    // clr = 12.92 * lclr for lclr <= 0.0031308f
1955    XMVECTOR smallC = XMVectorMultiply( lclr, g_XMsrgbScale );
1956
1957    // clr = (1+a)*pow(lclr, 1/2.4) - a for lclr > 0.0031308 (where a = 0.055)
1958    XMVECTOR largeC = XMVectorSubtract( XMVectorMultiply( g_XMsrgbA1, XMVectorPow( lclr, Exp ) ), g_XMsrgbA );
1959
1960    XMVECTOR clr = XMVectorSelect( smallC, largeC, sel );
1961
1962    return XMVectorSelect( xyz, clr, g_XMSelect1110 );
1963}
1964
1965//------------------------------------------------------------------------------
1966
1967inline XMVECTOR XMColorSRGBToXYZ( FXMVECTOR srgb )
1968{
1969    static const XMVECTORF32 Scale0 = { 0.4124f, 0.2126f, 0.0193f, 0.0f };
1970    static const XMVECTORF32 Scale1 = { 0.3576f, 0.7152f, 0.1192f, 0.0f };
1971    static const XMVECTORF32 Scale2 = { 0.1805f, 0.0722f, 0.9505f, 0.0f };
1972    static const XMVECTORF32 Cutoff = { 0.04045f, 0.04045f, 0.04045f, 0.0f };
1973    static const XMVECTORF32 Exp    = { 2.4f, 2.4f, 2.4f, 1.0f };
1974
1975    XMVECTOR sel = XMVectorGreater( srgb, Cutoff );
1976
1977    // lclr = clr / 12.92
1978    XMVECTOR smallC = XMVectorDivide( srgb, g_XMsrgbScale );
1979
1980    // lclr = pow( (clr + a) / (1+a), 2.4 )
1981    XMVECTOR largeC = XMVectorPow( XMVectorDivide( XMVectorAdd( srgb, g_XMsrgbA ), g_XMsrgbA1 ), Exp );
1982
1983    XMVECTOR lclr = XMVectorSelect( smallC, largeC, sel );
1984
1985    XMMATRIX M( Scale0, Scale1, Scale2, g_XMZero );
1986    XMVECTOR clr = XMVector3Transform( lclr, M );
1987
1988    return XMVectorSelect( srgb, clr, g_XMSelect1110 );
1989}
1990
1991/****************************************************************************
1992 *
1993 * Miscellaneous
1994 *
1995 ****************************************************************************/
1996
1997//------------------------------------------------------------------------------
1998
// Reports whether the running CPU supports the instruction set this build of
// the library was compiled for. Builds without SSE/NEON intrinsics always
// report true.
inline bool XMVerifyCPUSupport()
{
#if defined(_XM_NO_INTRINSICS_)
    // Intrinsics disabled: nothing to verify.
    return true;
#elif defined(_XM_SSE_INTRINSICS_)
    #if defined(_M_AMD64)
    // The X64 processor model requires SSE2 support
    return true;
    #elif defined(PF_XMMI_INSTRUCTIONS_AVAILABLE)
    // Note that on Windows 2000 or older, SSE2 detection is not supported so this will always fail
    // Detecting SSE2 on older versions of Windows would require using cpuid directly
    return ( IsProcessorFeaturePresent( PF_XMMI_INSTRUCTIONS_AVAILABLE ) != 0 && IsProcessorFeaturePresent( PF_XMMI64_INSTRUCTIONS_AVAILABLE ) != 0 );
    #else
    // If windows.h is not included, we return false (likely a false negative)
    return false;
    #endif
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    #ifdef PF_ARM_NEON_INSTRUCTIONS_AVAILABLE
    return ( IsProcessorFeaturePresent( PF_ARM_NEON_INSTRUCTIONS_AVAILABLE ) != 0 );
    #else
    // If windows.h is not included, we return false (likely a false negative)
    return false;
    #endif
#else
    // No SSE or NEON intrinsics selected: nothing to verify.
    return true;
#endif
}
2024
2025//------------------------------------------------------------------------------
2026
2027inline XMVECTOR XMFresnelTerm
2028(
2029    FXMVECTOR CosIncidentAngle,
2030    FXMVECTOR RefractionIndex
2031)
2032{
2033    assert(!XMVector4IsInfinite(CosIncidentAngle));
2034
2035    // Result = 0.5f * (g - c)^2 / (g + c)^2 * ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1) where
2036    // c = CosIncidentAngle
2037    // g = sqrt(c^2 + RefractionIndex^2 - 1)
2038
2039#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
2040
2041    XMVECTOR G = XMVectorMultiplyAdd(RefractionIndex, RefractionIndex, g_XMNegativeOne.v);
2042    G = XMVectorMultiplyAdd(CosIncidentAngle, CosIncidentAngle, G);
2043    G = XMVectorAbs(G);
2044    G = XMVectorSqrt(G);
2045
2046    XMVECTOR S = XMVectorAdd(G, CosIncidentAngle);
2047    XMVECTOR D = XMVectorSubtract(G, CosIncidentAngle);
2048
2049    XMVECTOR V0 = XMVectorMultiply(D, D);
2050    XMVECTOR V1 = XMVectorMultiply(S, S);
2051    V1 = XMVectorReciprocal(V1);
2052    V0 = XMVectorMultiply(g_XMOneHalf.v, V0);
2053    V0 = XMVectorMultiply(V0, V1);
2054
2055    XMVECTOR V2 = XMVectorMultiplyAdd(CosIncidentAngle, S, g_XMNegativeOne.v);
2056    XMVECTOR V3 = XMVectorMultiplyAdd(CosIncidentAngle, D, g_XMOne.v);
2057    V2 = XMVectorMultiply(V2, V2);
2058    V3 = XMVectorMultiply(V3, V3);
2059    V3 = XMVectorReciprocal(V3);
2060    V2 = XMVectorMultiplyAdd(V2, V3, g_XMOne.v);
2061
2062    XMVECTOR Result = XMVectorMultiply(V0, V2);
2063
2064    Result = XMVectorSaturate(Result);
2065
2066    return Result;
2067
2068#elif defined(_XM_SSE_INTRINSICS_)
2069    // G = sqrt(abs((RefractionIndex^2-1) + CosIncidentAngle^2))
2070    XMVECTOR G = _mm_mul_ps(RefractionIndex,RefractionIndex);
2071    XMVECTOR vTemp = _mm_mul_ps(CosIncidentAngle,CosIncidentAngle);
2072    G = _mm_sub_ps(G,g_XMOne);
2073    vTemp = _mm_add_ps(vTemp,G);
2074    // max((0-vTemp),vTemp) == abs(vTemp)
2075    // The abs is needed to deal with refraction and cosine being zero
2076    G = _mm_setzero_ps();
2077    G = _mm_sub_ps(G,vTemp);
2078    G = _mm_max_ps(G,vTemp);
2079    // Last operation, the sqrt()
2080    G = _mm_sqrt_ps(G);
2081
2082    // Calc G-C and G+C
2083    XMVECTOR GAddC = _mm_add_ps(G,CosIncidentAngle);
2084    XMVECTOR GSubC = _mm_sub_ps(G,CosIncidentAngle);
2085    // Perform the term (0.5f *(g - c)^2) / (g + c)^2 
2086    XMVECTOR vResult = _mm_mul_ps(GSubC,GSubC);
2087    vTemp = _mm_mul_ps(GAddC,GAddC);
2088    vResult = _mm_mul_ps(vResult,g_XMOneHalf);
2089    vResult = _mm_div_ps(vResult,vTemp);
2090    // Perform the term ((c * (g + c) - 1)^2 / (c * (g - c) + 1)^2 + 1)
2091    GAddC = _mm_mul_ps(GAddC,CosIncidentAngle);
2092    GSubC = _mm_mul_ps(GSubC,CosIncidentAngle);
2093    GAddC = _mm_sub_ps(GAddC,g_XMOne);
2094    GSubC = _mm_add_ps(GSubC,g_XMOne);
2095    GAddC = _mm_mul_ps(GAddC,GAddC);
2096    GSubC = _mm_mul_ps(GSubC,GSubC);
2097    GAddC = _mm_div_ps(GAddC,GSubC);
2098    GAddC = _mm_add_ps(GAddC,g_XMOne);
2099    // Multiply the two term parts
2100    vResult = _mm_mul_ps(vResult,GAddC);
2101    // Clamp to 0.0 - 1.0f
2102    vResult = _mm_max_ps(vResult,g_XMZero);
2103    vResult = _mm_min_ps(vResult,g_XMOne);
2104    return vResult;
2105#else // _XM_VMX128_INTRINSICS_
2106#endif // _XM_VMX128_INTRINSICS_
2107}
2108
2109//------------------------------------------------------------------------------
2110
// Returns true when S1 and S2 differ by no more than Epsilon.
inline bool XMScalarNearEqual
(
    float S1,
    float S2,
    float Epsilon
)
{
    const float gap = fabsf(S1 - S2);
    return gap <= Epsilon;
}
2121
2122//------------------------------------------------------------------------------
2123// Modulo the range of the given angle such that -XM_PI <= Angle < XM_PI
2124inline float XMScalarModAngle
2125(
2126    float Angle
2127)
2128{
2129    // Note: The modulo is performed with unsigned math only to work
2130    // around a precision error on numbers that are close to PI
2131
2132    // Normalize the range from 0.0f to XM_2PI
2133    Angle = Angle + XM_PI;
2134    // Perform the modulo, unsigned
2135    float fTemp = fabsf(Angle);
2136    fTemp = fTemp - (XM_2PI * (float)((int32_t)(fTemp/XM_2PI)));
2137    // Restore the number to the range of -XM_PI to XM_PI-epsilon
2138    fTemp = fTemp - XM_PI;
2139    // If the modulo'd value was negative, restore negation
2140    if (Angle<0.0f) {
2141        fTemp = -fTemp;
2142    }
2143    return fTemp;
2144}
2145
2146//------------------------------------------------------------------------------
2147
2148inline float XMScalarSin
2149(
2150    float Value
2151)
2152{
2153    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
2154    float quotient = XM_1DIV2PI*Value;
2155    if (Value >= 0.0f)
2156    {
2157        quotient = (float)((int)(quotient + 0.5f));
2158    }
2159    else
2160    {
2161        quotient = (float)((int)(quotient - 0.5f));
2162    }
2163    float y = Value - XM_2PI*quotient;
2164
2165    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
2166    if (y > XM_PIDIV2)
2167    {
2168        y = XM_PI - y;
2169    }
2170    else if (y < -XM_PIDIV2)
2171    {
2172        y = -XM_PI - y;
2173    }
2174
2175    // 11-degree minimax approximation
2176    float y2 = y * y;
2177    return ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y;
2178}
2179
2180//------------------------------------------------------------------------------
2181
2182inline float XMScalarSinEst
2183(
2184    float Value
2185)
2186{
2187    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
2188    float quotient = XM_1DIV2PI*Value;
2189    if (Value >= 0.0f)
2190    {
2191        quotient = (float)((int)(quotient + 0.5f));
2192    }
2193    else
2194    {
2195        quotient = (float)((int)(quotient - 0.5f));
2196    }
2197    float y = Value - XM_2PI*quotient;
2198
2199    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
2200    if (y > XM_PIDIV2)
2201    {
2202        y = XM_PI - y;
2203    }
2204    else if (y < -XM_PIDIV2)
2205    {
2206        y = -XM_PI - y;
2207    }
2208
2209    // 7-degree minimax approximation
2210    float y2 = y * y;
2211    return ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y;
2212}
2213
2214//------------------------------------------------------------------------------
2215
2216inline float XMScalarCos
2217(
2218    float Value
2219)
2220{
2221    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
2222    float quotient = XM_1DIV2PI*Value;
2223    if (Value >= 0.0f)
2224    {
2225        quotient = (float)((int)(quotient + 0.5f));
2226    }
2227    else
2228    {
2229        quotient = (float)((int)(quotient - 0.5f));
2230    }
2231    float y = Value - XM_2PI*quotient;
2232
2233    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
2234    float sign;
2235    if (y > XM_PIDIV2)
2236    {
2237        y = XM_PI - y;
2238        sign = -1.0f;
2239    }
2240    else if (y < -XM_PIDIV2)
2241    {
2242        y = -XM_PI - y;
2243        sign = -1.0f;
2244    }
2245    else
2246    {
2247        sign = +1.0f;
2248    }
2249
2250    // 10-degree minimax approximation
2251    float y2 = y*y;
2252    float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f;
2253    return sign*p;
2254}
2255
2256//------------------------------------------------------------------------------
2257
2258inline float XMScalarCosEst
2259(
2260    float Value
2261)
2262{
2263    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
2264    float quotient = XM_1DIV2PI*Value;
2265    if (Value >= 0.0f)
2266    {
2267        quotient = (float)((int)(quotient + 0.5f));
2268    }
2269    else
2270    {
2271        quotient = (float)((int)(quotient - 0.5f));
2272    }
2273    float y = Value - XM_2PI*quotient;
2274
2275    // Map y to [-pi/2,pi/2] with cos(y) = sign*cos(x).
2276    float sign;
2277    if (y > XM_PIDIV2)
2278    {
2279        y = XM_PI - y;
2280        sign = -1.0f;
2281    }
2282    else if (y < -XM_PIDIV2)
2283    {
2284        y = -XM_PI - y;
2285        sign = -1.0f;
2286    }
2287    else
2288    {
2289        sign = +1.0f;
2290    }
2291
2292    // 6-degree minimax approximation
2293    float y2 = y * y;
2294    float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f;
2295    return sign*p;
2296}
2297
2298//------------------------------------------------------------------------------
2299
2300_Use_decl_annotations_
2301inline void XMScalarSinCos
2302(
2303    float* pSin,
2304    float* pCos,
2305    float  Value
2306)
2307{
2308    assert(pSin);
2309    assert(pCos);
2310
2311    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
2312    float quotient = XM_1DIV2PI*Value;
2313    if (Value >= 0.0f)
2314    {
2315        quotient = (float)((int)(quotient + 0.5f));
2316    }
2317    else
2318    {
2319        quotient = (float)((int)(quotient - 0.5f));
2320    }
2321    float y = Value - XM_2PI*quotient;
2322
2323    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
2324    float sign;
2325    if (y > XM_PIDIV2)
2326    {
2327        y = XM_PI - y;
2328        sign = -1.0f;
2329    }
2330    else if (y < -XM_PIDIV2)
2331    {
2332        y = -XM_PI - y;
2333        sign = -1.0f;
2334    }
2335    else
2336    {
2337        sign = +1.0f;
2338    }
2339
2340    float y2 = y * y;
2341
2342    // 11-degree minimax approximation
2343    *pSin = ( ( ( ( (-2.3889859e-08f * y2 + 2.7525562e-06f) * y2 - 0.00019840874f ) * y2 + 0.0083333310f ) * y2 - 0.16666667f ) * y2 + 1.0f ) * y;
2344
2345    // 10-degree minimax approximation
2346    float p = ( ( ( ( -2.6051615e-07f * y2 + 2.4760495e-05f ) * y2 - 0.0013888378f ) * y2 + 0.041666638f ) * y2 - 0.5f ) * y2 + 1.0f;
2347    *pCos = sign*p;
2348}
2349
2350//------------------------------------------------------------------------------
2351
2352_Use_decl_annotations_
2353inline void XMScalarSinCosEst
2354(
2355    float* pSin,
2356    float* pCos,
2357    float  Value
2358)
2359{
2360    assert(pSin);
2361    assert(pCos);
2362
2363    // Map Value to y in [-pi,pi], x = 2*pi*quotient + remainder.
2364    float quotient = XM_1DIV2PI*Value;
2365    if (Value >= 0.0f)
2366    {
2367        quotient = (float)((int)(quotient + 0.5f));
2368    }
2369    else
2370    {
2371        quotient = (float)((int)(quotient - 0.5f));
2372    }
2373    float y = Value - XM_2PI*quotient;
2374
2375    // Map y to [-pi/2,pi/2] with sin(y) = sin(Value).
2376    float sign;
2377    if (y > XM_PIDIV2)
2378    {
2379        y = XM_PI - y;
2380        sign = -1.0f;
2381    }
2382    else if (y < -XM_PIDIV2)
2383    {
2384        y = -XM_PI - y;
2385        sign = -1.0f;
2386    }
2387    else
2388    {
2389        sign = +1.0f;
2390    }
2391
2392    float y2 = y * y;
2393
2394    // 7-degree minimax approximation
2395    *pSin = ( ( ( -0.00018524670f * y2 + 0.0083139502f ) * y2 - 0.16665852f ) * y2 + 1.0f ) * y;
2396
2397    // 6-degree minimax approximation
2398    float p = ( ( -0.0012712436f * y2 + 0.041493919f ) * y2 - 0.49992746f ) * y2 + 1.0f;
2399    *pCos = sign*p;
2400}
2401
2402//------------------------------------------------------------------------------
2403
2404inline float XMScalarASin
2405(
2406    float Value
2407)
2408{
2409    // Clamp input to [-1,1].
2410    bool nonnegative = (Value >= 0.0f);
2411    float x = fabsf(Value);
2412    float omx = 1.0f - x;
2413    if (omx < 0.0f)
2414    {
2415        omx = 0.0f;
2416    }
2417    float root = sqrt(omx);
2418
2419    // 7-degree minimax approximation
2420    float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f;
2421    result *= root;  // acos(|x|)
2422
2423    // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x)
2424    return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2);
2425}
2426
2427//------------------------------------------------------------------------------
2428
2429inline float XMScalarASinEst
2430(
2431    float Value
2432)
2433{
2434    // Clamp input to [-1,1].
2435    bool nonnegative = (Value >= 0.0f);
2436    float x = fabsf(Value);
2437    float omx = 1.0f - x;
2438    if (omx < 0.0f)
2439    {
2440        omx = 0.0f;
2441    }
2442    float root = sqrt(omx);
2443
2444    // 3-degree minimax approximation
2445    float result = ((-0.0187293f*x+0.0742610f)*x-0.2121144f)*x+1.5707288f;
2446    result *= root;  // acos(|x|)
2447
2448    // acos(x) = pi - acos(-x) when x < 0, asin(x) = pi/2 - acos(x)
2449    return (nonnegative ? XM_PIDIV2 - result : result - XM_PIDIV2);
2450}
2451
2452//------------------------------------------------------------------------------
2453
2454inline float XMScalarACos
2455(
2456    float Value
2457)
2458{
2459    // Clamp input to [-1,1].
2460    bool nonnegative = (Value >= 0.0f);
2461    float x = fabsf(Value);
2462    float omx = 1.0f - x;
2463    if (omx < 0.0f)
2464    {
2465        omx = 0.0f;
2466    }
2467    float root = sqrtf(omx);
2468
2469    // 7-degree minimax approximation
2470    float result = ( ( ( ( ( ( -0.0012624911f * x + 0.0066700901f ) * x - 0.0170881256f ) * x + 0.0308918810f ) * x - 0.0501743046f ) * x + 0.0889789874f ) * x - 0.2145988016f ) * x + 1.5707963050f;
2471    result *= root;
2472
2473    // acos(x) = pi - acos(-x) when x < 0
2474    return (nonnegative ? result : XM_PI - result);
2475}
2476
2477//------------------------------------------------------------------------------
2478
2479inline float XMScalarACosEst
2480(
2481    float Value
2482)
2483{
2484    // Clamp input to [-1,1].
2485    bool nonnegative = (Value >= 0.0f);
2486    float x = fabsf(Value);
2487    float omx = 1.0f - x;
2488    if (omx < 0.0f)
2489    {
2490        omx = 0.0f;
2491    }
2492    float root = sqrtf(omx);
2493
2494    // 3-degree minimax approximation
2495    float result = ( ( -0.0187293f * x + 0.0742610f ) * x - 0.2121144f ) * x + 1.5707288f;
2496    result *= root;
2497
2498    // acos(x) = pi - acos(-x) when x < 0
2499    return (nonnegative ? result : XM_PI - result);
2500}
2501