The game where you go into mines and start crafting — but for consoles (forked directly from smartcmd's GitHub repository).
at master 10596 lines 338 kB view raw
//-------------------------------------------------------------------------------------
// DirectXMathVector.inl -- SIMD C++ Math library
//
// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
// PARTICULAR PURPOSE.
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//-------------------------------------------------------------------------------------

#ifdef _MSC_VER
#pragma once
#endif

// Scalar IEEE-754 classification helpers for the no-intrinsics build:
// a float is NaN when the exponent bits are all ones and the mantissa is
// nonzero; infinity when the exponent is all ones and the mantissa is zero.
// NOTE(review): these pun a float through a uint32_t pointer, which the
// library has historically relied on the compiler to tolerate.
#if defined(_XM_NO_INTRINSICS_)
#define XMISNAN(x) ((*(uint32_t*)&(x) & 0x7F800000) == 0x7F800000 && (*(uint32_t*)&(x) & 0x7FFFFF) != 0)
#define XMISINF(x) ((*(uint32_t*)&(x) & 0x7FFFFFFF) == 0x7F800000)
#endif

/****************************************************************************
 *
 * General Vector
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Assignment operations
//------------------------------------------------------------------------------

// Every function below is compiled as exactly one of the paths in its #if
// ladder.  The final _XM_VMX128_INTRINSICS_ branch is intentionally empty in
// this public source drop.

//------------------------------------------------------------------------------
// Return a vector with all elements equaling zero
inline XMVECTOR XMVectorZero()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_setzero_ps();
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with four floating point values
inline XMVECTOR XMVectorSet
(
    float x,
    float y,
    float z,
    float w
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = {x,y,z,w};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Pack the raw bit patterns of (x,y) and (z,w) into two 64-bit halves,
    // then join them; avoids any float conversion.
    __n64 V0 = vcreate_f32(((uint64_t)*(const uint32_t *)&x) | ((uint64_t)(*(const uint32_t *)&y) << 32));
    __n64 V1 = vcreate_f32(((uint64_t)*(const uint32_t *)&z) | ((uint64_t)(*(const uint32_t *)&w) << 32));
    return vcombine_f32(V0, V1);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_set_ps takes arguments high-to-low, hence the reversed order.
    return _mm_set_ps( w, z, y, x );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with four integer values
inline XMVECTOR XMVectorSetInt
(
    uint32_t x,
    uint32_t y,
    uint32_t z,
    uint32_t w
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORU32 vResult = {x,y,z,w};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 V0 = vcreate_u32(((uint64_t)x) | ((uint64_t)y << 32));
    __n64 V1 = vcreate_u32(((uint64_t)z) | ((uint64_t)w << 32));
    return vcombine_u32(V0, V1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Reinterpret the integer register as float lanes (bit copy, no convert).
    __m128i V = _mm_set_epi32( w, z, y, x );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with a replicated floating point value
inline XMVECTOR XMVectorReplicate
(
    float Value
)
{
// Platforms that cannot do misaligned vector loads also take the scalar path.
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    XMVECTORF32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_f32( Value );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_set_ps1( Value );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with a replicated floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorReplicatePtr
(
    const float *pValue
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    float Value = pValue[0];
    XMVECTORF32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_dup_f32( pValue );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_load_ps1( pValue );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with a replicated integer value
inline XMVECTOR XMVectorReplicateInt
(
    uint32_t Value
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    XMVECTORU32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32( Value );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_set1_epi32( Value );
    return _mm_castsi128_ps(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with a replicated integer value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorReplicateIntPtr
(
    const uint32_t *pValue
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    uint32_t Value = pValue[0];
    XMVECTORU32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_dup_u32(pValue);
#elif defined(_XM_SSE_INTRINSICS_)
    // Load the 32-bit pattern through a float pointer and broadcast it.
    return _mm_load_ps1(reinterpret_cast<const float *>(pValue));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with all bits set (true mask)
inline XMVECTOR XMVectorTrueInt()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORU32 vResult = {0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Splatting signed -1 sets every bit in every lane.
    return vdupq_n_s32(-1);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_set1_epi32(-1);
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Initialize a vector with all bits clear (false mask)
inline XMVECTOR XMVectorFalseInt()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_setzero_ps();
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Replicate the x component of the vector
inline XMVECTOR XMVectorSplatX
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[0];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_lane_f32( vget_low_f32( V ), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Replicate the y component of the vector
inline XMVECTOR XMVectorSplatY
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[1];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_lane_f32( vget_low_f32( V ), 1 );
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Replicate the z component of the vector
inline XMVECTOR XMVectorSplatZ
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[2];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // z/w live in the high 64-bit half on NEON.
    return vdupq_lane_f32( vget_high_f32( V ), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Replicate the w component of the vector
inline XMVECTOR XMVectorSplatW
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[3];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_lane_f32( vget_high_f32( V ), 1 );
#elif defined(_XM_SSE_INTRINSICS_)
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a vector of 1.0f,1.0f,1.0f,1.0f
inline XMVECTOR XMVectorSplatOne()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = 1.0f;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_f32(1.0f);
#elif defined(_XM_SSE_INTRINSICS_)
    // Library-global constant; avoids materializing the value inline.
    return g_XMOne;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a vector of INF,INF,INF,INF
inline XMVECTOR XMVectorSplatInfinity()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    // 0x7F800000 is the single-precision +infinity bit pattern.
    vResult.vector4_u32[0] =
    vResult.vector4_u32[1] =
    vResult.vector4_u32[2] =
    vResult.vector4_u32[3] = 0x7F800000;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0x7F800000);
#elif defined(_XM_SSE_INTRINSICS_)
    return g_XMInfinity;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN
inline XMVECTOR XMVectorSplatQNaN()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    // 0x7FC00000 is the canonical quiet-NaN bit pattern.
    vResult.vector4_u32[0] =
    vResult.vector4_u32[1] =
    vResult.vector4_u32[2] =
    vResult.vector4_u32[3] = 0x7FC00000;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0x7FC00000);
#elif defined(_XM_SSE_INTRINSICS_)
    return g_XMQNaN;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f
inline XMVECTOR XMVectorSplatEpsilon()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    // 0x34000000 == 1.192092896e-7f (FLT_EPSILON) as raw bits.
    vResult.vector4_u32[0] =
    vResult.vector4_u32[1] =
    vResult.vector4_u32[2] =
    vResult.vector4_u32[3] = 0x34000000;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0x34000000);
#elif defined(_XM_SSE_INTRINSICS_)
    return g_XMEpsilon;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f
inline XMVECTOR XMVectorSplatSignMask()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_u32[0] =
    vResult.vector4_u32[1] =
    vResult.vector4_u32[2] =
    vResult.vector4_u32[3] = 0x80000000U;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0x80000000U);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_set1_epi32( 0x80000000 );
    return reinterpret_cast<__m128*>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return a floating point value via an index. This is not a recommended
// function to use due to performance loss.
inline float XMVectorGetByIndex(FXMVECTOR V, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Reads through the union member; forces the register to memory.
    return V.n128_f32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    return V.m128_f32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Return the X component in an FPU register.
inline float XMVectorGetX(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cvtss_f32(V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Y component in an FPU register.
inline float XMVectorGetY(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 to lane 0, then extract the low float.
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Z component in an FPU register.
inline float XMVectorGetZ(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the W component in an FPU register.
inline float XMVectorGetW(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 3);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store a component indexed by i into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetByIndexPtr(float *f, FXMVECTOR V, size_t i)
{
    assert( f != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    *f = V.vector4_f32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    *f = V.n128_f32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    *f = V.m128_f32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store the X component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetXPtr(float *x, FXMVECTOR V)
{
    assert( x != NULL);
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_f32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_store_ss(x,V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Y component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetYPtr(float *y, FXMVECTOR V)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *y = V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move lane 1 into lane 0 so a scalar store writes the Y value.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    _mm_store_ss(y,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Z component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetZPtr(float *z, FXMVECTOR V)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(z,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the W component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetWPtr(float *w, FXMVECTOR V)
{
    assert( w != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(w,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Return an integer value via an index. This is not a recommended
// function to use due to performance loss.
inline uint32_t XMVectorGetIntByIndex(FXMVECTOR V, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return V.n128_u32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    return V.m128_u32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Return the X component in an integer register.
inline uint32_t XMVectorGetIntX(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Bit-cast the register to integer lanes and extract lane 0.
    return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Y component in an integer register.
inline uint32_t XMVectorGetIntY(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 to lane 0, then read it out as a scalar.
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the Z component in an integer register.
inline uint32_t XMVectorGetIntZ(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Return the W component in an integer register.
inline uint32_t XMVectorGetIntW(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 3);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store a component indexed by i into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntByIndexPtr(uint32_t *x, FXMVECTOR V, size_t i)
{
    assert( x != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_u32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    *x = V.n128_u32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    *x = V.m128_u32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Store the X component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntXPtr(uint32_t *x, FXMVECTOR V)
{
    assert( x != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_u32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_store_ss writes the raw 32-bit lane; the float pointer cast is a
    // reinterpretation of the destination, not a conversion.
    _mm_store_ss(reinterpret_cast<float *>(x),V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Y component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *y = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move lane 1 into lane 0 so the scalar store writes the Y bits.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    _mm_store_ss(reinterpret_cast<float *>(y),vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the Z component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *z = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(reinterpret_cast<float *>(z),vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Store the W component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V)
{
    assert( w != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *w = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(reinterpret_cast<float *>(w),vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Set a single indexed floating point component
inline XMVECTOR XMVectorSetByIndex(FXMVECTOR V, float f, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_f32[i] = f;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Copy through the union so a single lane can be written in memory.
    XMVECTOR U = V;
    U.n128_f32[i] = f;
    return U;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR U = V;
    U.m128_f32[i] = f;
    return U;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets the X component of a vector to a passed floating point value
inline XMVECTOR XMVectorSetX(FXMVECTOR V, float x)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = x;
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_move_ss replaces only lane 0 of V with lane 0 of vResult.
    XMVECTOR vResult = _mm_set_ss(x);
    vResult = _mm_move_ss(V,vResult);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the Y component of a vector to a passed floating point value
inline XMVECTOR XMVectorSetY(FXMVECTOR V, float y)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = y;
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap y and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    // Convert input to vector
    XMVECTOR vTemp = _mm_set_ss(y);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap y and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
// Sets the Z component of a vector to a passed floating point value
inline XMVECTOR XMVectorSetZ(FXMVECTOR V, float z)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = z;
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap z and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
    // Convert input to vector
    XMVECTOR vTemp = _mm_set_ss(z);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap z and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the W component of a vector to a passed floating point value
inline XMVECTOR XMVectorSetW(FXMVECTOR V, float w)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = w;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap w and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
    // Convert input to vector
    XMVECTOR vTemp = _mm_set_ss(w);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap w and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets a component of a vector to a floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetByIndexPtr(FXMVECTOR V, const float *f, size_t i)
{
    assert( f != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_f32[i] = *f;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR U = V;
    U.n128_f32[i] = *f;
    return U;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR U = V;
    U.m128_f32[i] = *f;
    return U;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets the X component of a vector to a floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetXPtr(FXMVECTOR V, const float *x)
{
    assert( x != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = *x;
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Loads a single lane directly from memory into lane 0.
    return vld1q_lane_f32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = _mm_load_ss(x);
    vResult = _mm_move_ss(V,vResult);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the Y component of a vector to a floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetYPtr(FXMVECTOR V, const float *y)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = *y;
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap y and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(y);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap y and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the Z component of a vector to a floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetZPtr(FXMVECTOR V, const float *z)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = *z;
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap z and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(z);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap z and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the W component of a vector to a floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetWPtr(FXMVECTOR V, const float *w)
{
    assert( w != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = *w;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap w and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(w);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap w and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets a component of a vector to an integer passed by value
inline XMVECTOR XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_u32[i] = x;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Round-trip through the union so one 32-bit lane can be overwritten.
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = x;
    return tmp;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = x;
    return tmp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets the X component of a vector to an integer passed by value
inline XMVECTOR XMVectorSetIntX(FXMVECTOR V, uint32_t x)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = x;
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Place x in lane 0 of an integer register, then splice it into V.
    __m128i vTemp = _mm_cvtsi32_si128(x);
    XMVECTOR vResult = _mm_move_ss(V,_mm_castsi128_ps(vTemp));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the Y component of a vector to an integer passed by value
inline XMVECTOR XMVectorSetIntY(FXMVECTOR V, uint32_t y)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = y;
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap y and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    // Convert input to vector
    __m128i vTemp = _mm_cvtsi32_si128(y);
    // Replace the x component
    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
    // Swap y and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the Z component of a vector to an integer passed by value
inline XMVECTOR XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = z;
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap z and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
    // Convert input to vector
    __m128i vTemp = _mm_cvtsi32_si128(z);
    // Replace the x component
    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
    // Swap z and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

// Sets the W component of a vector to an integer passed by value
inline XMVECTOR XMVectorSetIntW(FXMVECTOR V, uint32_t w)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = w;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap w and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
    // Convert input to vector
    __m128i vTemp = _mm_cvtsi32_si128(w);
    // Replace the x component
    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
    // Swap w and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets a component of a vector to an integer value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t *x, size_t i)
{
    assert( x != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_u32[i] = *x;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = *x;
    return tmp;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = *x;
    return tmp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Sets the X component of a vector
to an integer value passed by pointer 1115_Use_decl_annotations_ 1116inline XMVECTOR XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t *x) 1117{ 1118 assert( x != NULL ); 1119#if defined(_XM_NO_INTRINSICS_) 1120 XMVECTOR U; 1121 U.vector4_u32[0] = *x; 1122 U.vector4_u32[1] = V.vector4_u32[1]; 1123 U.vector4_u32[2] = V.vector4_u32[2]; 1124 U.vector4_u32[3] = V.vector4_u32[3]; 1125 return U; 1126#elif defined(_XM_ARM_NEON_INTRINSICS_) 1127 return vld1q_lane_u32(x,V,0); 1128#elif defined(_XM_SSE_INTRINSICS_) 1129 XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(x)); 1130 XMVECTOR vResult = _mm_move_ss(V,vTemp); 1131 return vResult; 1132#else // _XM_VMX128_INTRINSICS_ 1133#endif // _XM_VMX128_INTRINSICS_ 1134} 1135 1136// Sets the Y component of a vector to an integer value passed by pointer 1137_Use_decl_annotations_ 1138inline XMVECTOR XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t *y) 1139{ 1140 assert( y != NULL ); 1141#if defined(_XM_NO_INTRINSICS_) 1142 XMVECTOR U; 1143 U.vector4_u32[0] = V.vector4_u32[0]; 1144 U.vector4_u32[1] = *y; 1145 U.vector4_u32[2] = V.vector4_u32[2]; 1146 U.vector4_u32[3] = V.vector4_u32[3]; 1147 return U; 1148#elif defined(_XM_ARM_NEON_INTRINSICS_) 1149 return vld1q_lane_u32(y,V,1); 1150#elif defined(_XM_SSE_INTRINSICS_) 1151 // Swap y and x 1152 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); 1153 // Convert input to vector 1154 XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(y)); 1155 // Replace the x component 1156 vResult = _mm_move_ss(vResult,vTemp); 1157 // Swap y and x again 1158 vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1)); 1159 return vResult; 1160#else // _XM_VMX128_INTRINSICS_ 1161#endif // _XM_VMX128_INTRINSICS_ 1162} 1163 1164// Sets the Z component of a vector to an integer value passed by pointer 1165_Use_decl_annotations_ 1166inline XMVECTOR XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t *z) 1167{ 1168 assert( z != NULL ); 1169#if defined(_XM_NO_INTRINSICS_) 1170 XMVECTOR U; 1171 
U.vector4_u32[0] = V.vector4_u32[0]; 1172 U.vector4_u32[1] = V.vector4_u32[1]; 1173 U.vector4_u32[2] = *z; 1174 U.vector4_u32[3] = V.vector4_u32[3]; 1175 return U; 1176#elif defined(_XM_ARM_NEON_INTRINSICS_) 1177 return vld1q_lane_u32(z,V,2); 1178#elif defined(_XM_SSE_INTRINSICS_) 1179 // Swap z and x 1180 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2)); 1181 // Convert input to vector 1182 XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(z)); 1183 // Replace the x component 1184 vResult = _mm_move_ss(vResult,vTemp); 1185 // Swap z and x again 1186 vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); 1187 return vResult; 1188#else // _XM_VMX128_INTRINSICS_ 1189#endif // _XM_VMX128_INTRINSICS_ 1190} 1191 1192// Sets the W component of a vector to an integer value passed by pointer 1193_Use_decl_annotations_ 1194inline XMVECTOR XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t *w) 1195{ 1196 assert( w != NULL ); 1197#if defined(_XM_NO_INTRINSICS_) 1198 XMVECTOR U; 1199 U.vector4_u32[0] = V.vector4_u32[0]; 1200 U.vector4_u32[1] = V.vector4_u32[1]; 1201 U.vector4_u32[2] = V.vector4_u32[2]; 1202 U.vector4_u32[3] = *w; 1203 return U; 1204#elif defined(_XM_ARM_NEON_INTRINSICS_) 1205 return vld1q_lane_u32(w,V,3); 1206#elif defined(_XM_SSE_INTRINSICS_) 1207 // Swap w and x 1208 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3)); 1209 // Convert input to vector 1210 XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(w)); 1211 // Replace the x component 1212 vResult = _mm_move_ss(vResult,vTemp); 1213 // Swap w and x again 1214 vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3)); 1215 return vResult; 1216#else // _XM_VMX128_INTRINSICS_ 1217#endif // _XM_VMX128_INTRINSICS_ 1218} 1219 1220//------------------------------------------------------------------------------ 1221 1222inline XMVECTOR XMVectorSwizzle 1223( 1224 FXMVECTOR V, 1225 uint32_t E0, 1226 uint32_t E1, 1227 uint32_t E2, 1228 uint32_t E3 1229) 1230{ 1231 assert( (E0 < 4) && (E1 
< 4) && (E2 < 4) && (E3 < 4) ); 1232 _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) ); 1233#if defined(_XM_NO_INTRINSICS_) 1234 1235 XMVECTOR Result = { V.vector4_f32[E0], 1236 V.vector4_f32[E1], 1237 V.vector4_f32[E2], 1238 V.vector4_f32[E3] }; 1239 return Result; 1240 1241#elif defined(_XM_ARM_NEON_INTRINSICS_) 1242 static const uint32_t ControlElement[ 4 ] = 1243 { 1244#ifdef _XM_LITTLEENDIAN_ 1245 0x03020100, // XM_SWIZZLE_X 1246 0x07060504, // XM_SWIZZLE_Y 1247 0x0B0A0908, // XM_SWIZZLE_Z 1248 0x0F0E0D0C, // XM_SWIZZLE_W 1249#else 1250 0x00010203, // XM_SWIZZLE_X 1251 0x04050607, // XM_SWIZZLE_Y 1252 0x08090A0B, // XM_SWIZZLE_Z 1253 0x0C0D0E0F, // XM_SWIZZLE_W 1254#endif 1255 }; 1256 1257 int8x8x2_t tbl; 1258 tbl.val[0] = vget_low_f32(V); 1259 tbl.val[1] = vget_high_f32(V); 1260 1261 __n64 idx = vcreate_u32( ((uint64_t)ControlElement[E0]) | (((uint64_t)ControlElement[E1]) << 32) ); 1262 const __n64 rL = vtbl2_u8( tbl, idx ); 1263 1264 idx = vcreate_u32( ((uint64_t)ControlElement[E2]) | (((uint64_t)ControlElement[E3]) << 32) ); 1265 const __n64 rH = vtbl2_u8( tbl, idx ); 1266 1267 return vcombine_f32( rL, rH ); 1268#elif defined(_XM_VMX128_INTRINSICS_) 1269#else 1270 const uint32_t *aPtr = (const uint32_t* )(&V); 1271 1272 XMVECTOR Result; 1273 uint32_t *pWork = (uint32_t*)(&Result); 1274 1275 pWork[0] = aPtr[E0]; 1276 pWork[1] = aPtr[E1]; 1277 pWork[2] = aPtr[E2]; 1278 pWork[3] = aPtr[E3]; 1279 1280 return Result; 1281#endif 1282} 1283 1284//------------------------------------------------------------------------------ 1285inline XMVECTOR XMVectorPermute 1286( 1287 FXMVECTOR V1, 1288 FXMVECTOR V2, 1289 uint32_t PermuteX, 1290 uint32_t PermuteY, 1291 uint32_t PermuteZ, 1292 uint32_t PermuteW 1293) 1294{ 1295 assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); 1296 _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 ); 1297 1298#if defined(_XM_ARM_NEON_INTRINSICS_) && 
!defined(_XM_NO_INTRINSICS_) 1299 static const uint32_t ControlElement[ 8 ] = 1300 { 1301#ifdef _XM_LITTLEENDIAN_ 1302 0x03020100, // XM_PERMUTE_0X 1303 0x07060504, // XM_PERMUTE_0Y 1304 0x0B0A0908, // XM_PERMUTE_0Z 1305 0x0F0E0D0C, // XM_PERMUTE_0W 1306 0x13121110, // XM_PERMUTE_1X 1307 0x17161514, // XM_PERMUTE_1Y 1308 0x1B1A1918, // XM_PERMUTE_1Z 1309 0x1F1E1D1C, // XM_PERMUTE_1W 1310#else 1311 0x00010203, // XM_PERMUTE_0X 1312 0x04050607, // XM_PERMUTE_0Y 1313 0x08090A0B, // XM_PERMUTE_0Z 1314 0x0C0D0E0F, // XM_PERMUTE_0W 1315 0x10111213, // XM_PERMUTE_1X 1316 0x14151617, // XM_PERMUTE_1Y 1317 0x18191A1B, // XM_PERMUTE_1Z 1318 0x1C1D1E1F, // XM_PERMUTE_1W 1319#endif 1320 }; 1321 1322 int8x8x4_t tbl; 1323 tbl.val[0] = vget_low_f32(V1); 1324 tbl.val[1] = vget_high_f32(V1); 1325 tbl.val[2] = vget_low_f32(V2); 1326 tbl.val[3] = vget_high_f32(V2); 1327 1328 __n64 idx = vcreate_u32( ((uint64_t)ControlElement[PermuteX]) | (((uint64_t)ControlElement[PermuteY]) << 32) ); 1329 const __n64 rL = vtbl4_u8( tbl, idx ); 1330 1331 idx = vcreate_u32( ((uint64_t)ControlElement[PermuteZ]) | (((uint64_t)ControlElement[PermuteW]) << 32) ); 1332 const __n64 rH = vtbl4_u8( tbl, idx ); 1333 1334 return vcombine_f32( rL, rH ); 1335#elif defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 1336#else 1337 1338 const uint32_t *aPtr[2]; 1339 aPtr[0] = (const uint32_t* )(&V1); 1340 aPtr[1] = (const uint32_t* )(&V2); 1341 1342 XMVECTOR Result; 1343 uint32_t *pWork = (uint32_t*)(&Result); 1344 1345 const uint32_t i0 = PermuteX & 3; 1346 const uint32_t vi0 = PermuteX >> 2; 1347 pWork[0] = aPtr[vi0][i0]; 1348 1349 const uint32_t i1 = PermuteY & 3; 1350 const uint32_t vi1 = PermuteY >> 2; 1351 pWork[1] = aPtr[vi1][i1]; 1352 1353 const uint32_t i2 = PermuteZ & 3; 1354 const uint32_t vi2 = PermuteZ >> 2; 1355 pWork[2] = aPtr[vi2][i2]; 1356 1357 const uint32_t i3 = PermuteW & 3; 1358 const uint32_t vi3 = PermuteW >> 2; 1359 pWork[3] = aPtr[vi3][i3]; 1360 1361 return Result; 1362#endif 
1363} 1364 1365//------------------------------------------------------------------------------ 1366// Define a control vector to be used in XMVectorSelect 1367// operations. The four integers specified in XMVectorSelectControl 1368// serve as indices to select between components in two vectors. 1369// The first index controls selection for the first component of 1370// the vectors involved in a select operation, the second index 1371// controls selection for the second component etc. A value of 1372// zero for an index causes the corresponding component from the first 1373// vector to be selected whereas a one causes the component from the 1374// second vector to be selected instead. 1375 1376inline XMVECTOR XMVectorSelectControl 1377( 1378 uint32_t VectorIndex0, 1379 uint32_t VectorIndex1, 1380 uint32_t VectorIndex2, 1381 uint32_t VectorIndex3 1382) 1383{ 1384#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 1385 // x=Index0,y=Index1,z=Index2,w=Index3 1386 __m128i vTemp = _mm_set_epi32(VectorIndex3,VectorIndex2,VectorIndex1,VectorIndex0); 1387 // Any non-zero entries become 0xFFFFFFFF else 0 1388 vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero); 1389 return reinterpret_cast<__m128 *>(&vTemp)[0]; 1390#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 1391 __n64 V0 = vcreate_s32(((uint64_t)VectorIndex0) | ((uint64_t)VectorIndex1 << 32)); 1392 __n64 V1 = vcreate_s32(((uint64_t)VectorIndex2) | ((uint64_t)VectorIndex3 << 32)); 1393 __n128 vTemp = vcombine_s32(V0, V1); 1394 // Any non-zero entries become 0xFFFFFFFF else 0 1395 return vcgtq_s32(vTemp,g_XMZero); 1396#else 1397 XMVECTOR ControlVector; 1398 const uint32_t ControlElement[] = 1399 { 1400 XM_SELECT_0, 1401 XM_SELECT_1 1402 }; 1403 1404 assert(VectorIndex0 < 2); 1405 assert(VectorIndex1 < 2); 1406 assert(VectorIndex2 < 2); 1407 assert(VectorIndex3 < 2); 1408 _Analysis_assume_(VectorIndex0 < 2); 1409 _Analysis_assume_(VectorIndex1 < 2); 1410 _Analysis_assume_(VectorIndex2 < 2); 1411 
_Analysis_assume_(VectorIndex3 < 2); 1412 1413 ControlVector.vector4_u32[0] = ControlElement[VectorIndex0]; 1414 ControlVector.vector4_u32[1] = ControlElement[VectorIndex1]; 1415 ControlVector.vector4_u32[2] = ControlElement[VectorIndex2]; 1416 ControlVector.vector4_u32[3] = ControlElement[VectorIndex3]; 1417 1418 return ControlVector; 1419 1420#endif 1421} 1422 1423//------------------------------------------------------------------------------ 1424 1425inline XMVECTOR XMVectorSelect 1426( 1427 FXMVECTOR V1, 1428 FXMVECTOR V2, 1429 FXMVECTOR Control 1430) 1431{ 1432#if defined(_XM_NO_INTRINSICS_) 1433 1434 XMVECTOR Result; 1435 Result.vector4_u32[0] = (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]); 1436 Result.vector4_u32[1] = (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]); 1437 Result.vector4_u32[2] = (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]); 1438 Result.vector4_u32[3] = (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]); 1439 return Result; 1440 1441#elif defined(_XM_ARM_NEON_INTRINSICS_) 1442 return vbslq_f32( Control, V2, V1 ); 1443#elif defined(_XM_SSE_INTRINSICS_) 1444 XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1); 1445 XMVECTOR vTemp2 = _mm_and_ps(V2,Control); 1446 return _mm_or_ps(vTemp1,vTemp2); 1447#else // _XM_VMX128_INTRINSICS_ 1448#endif // _XM_VMX128_INTRINSICS_ 1449} 1450 1451//------------------------------------------------------------------------------ 1452 1453inline XMVECTOR XMVectorMergeXY 1454( 1455 FXMVECTOR V1, 1456 FXMVECTOR V2 1457) 1458{ 1459#if defined(_XM_NO_INTRINSICS_) 1460 1461 XMVECTOR Result; 1462 Result.vector4_u32[0] = V1.vector4_u32[0]; 1463 Result.vector4_u32[1] = V2.vector4_u32[0]; 1464 Result.vector4_u32[2] = V1.vector4_u32[1]; 1465 Result.vector4_u32[3] = V2.vector4_u32[1]; 1466 return Result; 1467 1468#elif defined(_XM_ARM_NEON_INTRINSICS_) 
1469 return vzipq_f32( V1, V2 ).val[0]; 1470#elif defined(_XM_SSE_INTRINSICS_) 1471 return _mm_unpacklo_ps( V1, V2 ); 1472#else // _XM_VMX128_INTRINSICS_ 1473#endif // _XM_VMX128_INTRINSICS_ 1474} 1475 1476//------------------------------------------------------------------------------ 1477 1478inline XMVECTOR XMVectorMergeZW 1479( 1480 FXMVECTOR V1, 1481 FXMVECTOR V2 1482) 1483{ 1484#if defined(_XM_NO_INTRINSICS_) 1485 1486 XMVECTOR Result; 1487 Result.vector4_u32[0] = V1.vector4_u32[2]; 1488 Result.vector4_u32[1] = V2.vector4_u32[2]; 1489 Result.vector4_u32[2] = V1.vector4_u32[3]; 1490 Result.vector4_u32[3] = V2.vector4_u32[3]; 1491 return Result; 1492 1493#elif defined(_XM_ARM_NEON_INTRINSICS_) 1494 return vzipq_f32( V1, V2 ).val[1]; 1495#elif defined(_XM_SSE_INTRINSICS_) 1496 return _mm_unpackhi_ps( V1, V2 ); 1497#else // _XM_VMX128_INTRINSICS_ 1498#endif // _XM_VMX128_INTRINSICS_ 1499} 1500 1501//------------------------------------------------------------------------------ 1502 1503inline XMVECTOR XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements) 1504{ 1505 assert( Elements < 4 ); 1506 _Analysis_assume_( Elements < 4 ); 1507 return XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3)); 1508} 1509 1510//------------------------------------------------------------------------------ 1511 1512inline XMVECTOR XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements) 1513{ 1514 assert( Elements < 4 ); 1515 _Analysis_assume_( Elements < 4 ); 1516 return XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 ); 1517} 1518 1519//------------------------------------------------------------------------------ 1520 1521inline XMVECTOR XMVectorRotateRight(FXMVECTOR V, uint32_t Elements) 1522{ 1523 assert( Elements < 4 ); 1524 _Analysis_assume_( Elements < 4 ); 1525 return XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 ); 1526} 1527 
//------------------------------------------------------------------------------

// Rotates VS left by VSLeftRotateElements, then inserts the rotated
// components into VD wherever the corresponding SelectN flag (low bit) is 1;
// components with SelectN == 0 keep VD's value.
inline XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements,
                               uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3)
{
    XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1);
    return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control );
}

//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Per-component floating-point equality test. Each result component is
// 0xFFFFFFFF where V1 == V2 and 0 otherwise (usable as a select mask).
inline XMVECTOR XMVectorEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[1] = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[2] = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[3] = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vceqq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmpeq_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Same mask as XMVectorEqual, and additionally writes a CR6-style summary to
// *pCR: XM_CRMASK_CR6TRUE if all four components compared equal,
// XM_CRMASK_CR6FALSE if none did, 0 otherwise.
_Use_decl_annotations_
inline XMVECTOR XMVectorEqualR
(
    uint32_t* pCR,
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    assert( pCR != NULL );
#if defined(_XM_NO_INTRINSICS_)
    uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!(ux|uy|uz|uw))
    {
        // All elements are not equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_f32( V1, V2 );
    // Collapse the four 32-bit lane masks into a single 32-bit value
    // (one byte per lane) via two interleave passes.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // All elements are not equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    uint32_t CR = 0;
    // movemask packs one sign bit per lane: 0xf == all lanes equal
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf)
    {
        // All elements are equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // All elements are not equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Treat the components of the vectors as unsigned integers and
// compare individual bits between the two. This is useful for
// comparing control vectors and result vectors returned from
// other comparison operations.
1644 1645inline XMVECTOR XMVectorEqualInt 1646( 1647 FXMVECTOR V1, 1648 FXMVECTOR V2 1649) 1650{ 1651#if defined(_XM_NO_INTRINSICS_) 1652 1653 XMVECTOR Control; 1654 Control.vector4_u32[0] = (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0; 1655 Control.vector4_u32[1] = (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0; 1656 Control.vector4_u32[2] = (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0; 1657 Control.vector4_u32[3] = (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 0xFFFFFFFF : 0; 1658 return Control; 1659 1660#elif defined(_XM_ARM_NEON_INTRINSICS_) 1661 return vceqq_u32( V1, V2 ); 1662#elif defined(_XM_SSE_INTRINSICS_) 1663 __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); 1664 return reinterpret_cast<__m128 *>(&V)[0]; 1665#else // _XM_VMX128_INTRINSICS_ 1666#endif // _XM_VMX128_INTRINSICS_ 1667} 1668 1669//------------------------------------------------------------------------------ 1670 1671_Use_decl_annotations_ 1672inline XMVECTOR XMVectorEqualIntR 1673( 1674 uint32_t* pCR, 1675 FXMVECTOR V1, 1676 FXMVECTOR V2 1677) 1678{ 1679 assert( pCR != NULL ); 1680#if defined(_XM_NO_INTRINSICS_) 1681 1682 XMVECTOR Control = XMVectorEqualInt(V1, V2); 1683 1684 *pCR = 0; 1685 if (XMVector4EqualInt(Control, XMVectorTrueInt())) 1686 { 1687 // All elements are equal 1688 *pCR |= XM_CRMASK_CR6TRUE; 1689 } 1690 else if (XMVector4EqualInt(Control, XMVectorFalseInt())) 1691 { 1692 // All elements are not equal 1693 *pCR |= XM_CRMASK_CR6FALSE; 1694 } 1695 return Control; 1696 1697#elif defined(_XM_ARM_NEON_INTRINSICS_) 1698 __n128 vResult = vceqq_u32( V1, V2 ); 1699 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 1700 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 1701 uint32_t r = vget_lane_u32(vTemp.val[1], 1); 1702 uint32_t CR = 0; 1703 if ( r == 0xFFFFFFFFU ) 1704 { 1705 // All elements are equal 1706 CR = XM_CRMASK_CR6TRUE; 1707 } 1708 else if ( !r ) 1709 { 1710 // All elements are not equal 1711 
CR = XM_CRMASK_CR6FALSE; 1712 } 1713 *pCR = CR; 1714 return vResult; 1715#elif defined(_XM_SSE_INTRINSICS_) 1716 __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); 1717 int iTemp = _mm_movemask_ps(reinterpret_cast<const __m128*>(&V)[0]); 1718 uint32_t CR = 0; 1719 if (iTemp==0x0F) 1720 { 1721 CR = XM_CRMASK_CR6TRUE; 1722 } 1723 else if (!iTemp) 1724 { 1725 CR = XM_CRMASK_CR6FALSE; 1726 } 1727 *pCR = CR; 1728 return reinterpret_cast<__m128 *>(&V)[0]; 1729#else // _XM_VMX128_INTRINSICS_ 1730#endif // _XM_VMX128_INTRINSICS_ 1731} 1732 1733//------------------------------------------------------------------------------ 1734 1735inline XMVECTOR XMVectorNearEqual 1736( 1737 FXMVECTOR V1, 1738 FXMVECTOR V2, 1739 FXMVECTOR Epsilon 1740) 1741{ 1742#if defined(_XM_NO_INTRINSICS_) 1743 1744 float fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0]; 1745 float fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1]; 1746 float fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2]; 1747 float fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3]; 1748 1749 fDeltax = fabsf(fDeltax); 1750 fDeltay = fabsf(fDeltay); 1751 fDeltaz = fabsf(fDeltaz); 1752 fDeltaw = fabsf(fDeltaw); 1753 1754 XMVECTOR Control; 1755 Control.vector4_u32[0] = (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0; 1756 Control.vector4_u32[1] = (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0; 1757 Control.vector4_u32[2] = (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0; 1758 Control.vector4_u32[3] = (fDeltaw <= Epsilon.vector4_f32[3]) ? 
0xFFFFFFFFU : 0; 1759 return Control; 1760 1761#elif defined(_XM_ARM_NEON_INTRINSICS_) 1762 XMVECTOR vDelta = vsubq_f32(V1,V2); 1763 return vacleq_f32( vDelta, Epsilon ); 1764#elif defined(_XM_SSE_INTRINSICS_) 1765 // Get the difference 1766 XMVECTOR vDelta = _mm_sub_ps(V1,V2); 1767 // Get the absolute value of the difference 1768 XMVECTOR vTemp = _mm_setzero_ps(); 1769 vTemp = _mm_sub_ps(vTemp,vDelta); 1770 vTemp = _mm_max_ps(vTemp,vDelta); 1771 vTemp = _mm_cmple_ps(vTemp,Epsilon); 1772 return vTemp; 1773#else // _XM_VMX128_INTRINSICS_ 1774#endif // _XM_VMX128_INTRINSICS_ 1775} 1776 1777//------------------------------------------------------------------------------ 1778 1779inline XMVECTOR XMVectorNotEqual 1780( 1781 FXMVECTOR V1, 1782 FXMVECTOR V2 1783) 1784{ 1785#if defined(_XM_NO_INTRINSICS_) 1786 1787 XMVECTOR Control; 1788 Control.vector4_u32[0] = (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; 1789 Control.vector4_u32[1] = (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; 1790 Control.vector4_u32[2] = (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; 1791 Control.vector4_u32[3] = (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; 1792 return Control; 1793 1794#elif defined(_XM_ARM_NEON_INTRINSICS_) 1795 return vmvnq_u32(vceqq_f32(V1, V2)); 1796#elif defined(_XM_SSE_INTRINSICS_) 1797 return _mm_cmpneq_ps( V1, V2 ); 1798#else // _XM_VMX128_INTRINSICS_ 1799#endif // _XM_VMX128_INTRINSICS_ 1800} 1801 1802//------------------------------------------------------------------------------ 1803 1804inline XMVECTOR XMVectorNotEqualInt 1805( 1806 FXMVECTOR V1, 1807 FXMVECTOR V2 1808) 1809{ 1810#if defined(_XM_NO_INTRINSICS_) 1811 1812 XMVECTOR Control; 1813 Control.vector4_u32[0] = (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0; 1814 Control.vector4_u32[1] = (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0; 1815 Control.vector4_u32[2] = (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 
0xFFFFFFFFU : 0; 1816 Control.vector4_u32[3] = (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0; 1817 return Control; 1818 1819#elif defined(_XM_ARM_NEON_INTRINSICS_) 1820 return vmvnq_u32(vceqq_u32(V1, V2)); 1821#elif defined(_XM_SSE_INTRINSICS_) 1822 __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) ); 1823 return _mm_xor_ps(reinterpret_cast<__m128 *>(&V)[0],g_XMNegOneMask); 1824#else // _XM_VMX128_INTRINSICS_ 1825#endif // _XM_VMX128_INTRINSICS_ 1826} 1827 1828//------------------------------------------------------------------------------ 1829 1830inline XMVECTOR XMVectorGreater 1831( 1832 FXMVECTOR V1, 1833 FXMVECTOR V2 1834) 1835{ 1836#if defined(_XM_NO_INTRINSICS_) 1837 1838 XMVECTOR Control; 1839 Control.vector4_u32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0; 1840 Control.vector4_u32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0; 1841 Control.vector4_u32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0; 1842 Control.vector4_u32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0; 1843 return Control; 1844 1845#elif defined(_XM_ARM_NEON_INTRINSICS_) 1846 return vcgtq_f32( V1, V2 ); 1847#elif defined(_XM_SSE_INTRINSICS_) 1848 return _mm_cmpgt_ps( V1, V2 ); 1849#else // _XM_VMX128_INTRINSICS_ 1850#endif // _XM_VMX128_INTRINSICS_ 1851} 1852 1853//------------------------------------------------------------------------------ 1854 1855_Use_decl_annotations_ 1856inline XMVECTOR XMVectorGreaterR 1857( 1858 uint32_t* pCR, 1859 FXMVECTOR V1, 1860 FXMVECTOR V2 1861) 1862{ 1863 assert( pCR != NULL ); 1864#if defined(_XM_NO_INTRINSICS_) 1865 1866 uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0; 1867 uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0; 1868 uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0; 1869 uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 
0xFFFFFFFFU : 0;
    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!(ux|uy|uz|uw))
    {
        // All elements are not greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgtq_f32( V1, V2 );
    // Pack the four 32-bit lane masks down so one 32-bit read tests all lanes
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // All elements are not greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    uint32_t CR = 0;
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf)
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // All elements are not greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where V1 >= V2, 0 otherwise.
inline XMVECTOR XMVectorGreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[1] = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[2] = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[3] = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vcgeq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmpge_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// As XMVectorGreaterOrEqual, but additionally writes a CR6-style comparison
// record to *pCR: XM_CRMASK_CR6TRUE when all four components compare >=,
// XM_CRMASK_CR6FALSE when none do, 0 for a mixed result.
_Use_decl_annotations_
inline XMVECTOR XMVectorGreaterOrEqualR
(
    uint32_t* pCR,
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    assert( pCR != NULL );
#if defined(_XM_NO_INTRINSICS_)

    uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!(ux|uy|uz|uw))
    {
        // All elements are not greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgeq_f32( V1, V2 );
    // Pack the four 32-bit lane masks down so one 32-bit read tests all lanes
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // All elements are not greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    uint32_t CR = 0;
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf)
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // All elements are not greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where V1 < V2, 0 otherwise.
inline XMVECTOR XMVectorLess
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vcltq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmplt_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where V1 <= V2, 0 otherwise.
inline XMVECTOR XMVectorLessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[1] = (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[2] = (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[3] = (V1.vector4_f32[3] <= V2.vector4_f32[3]) ?
0xFFFFFFFF : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vcleq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_cmple_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where -Bounds <= V <= Bounds, 0 otherwise.
inline XMVECTOR XMVectorInBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[1] = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[2] = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0;
    Control.vector4_u32[3] = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFF : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = vcleq_f32(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = vnegq_f32(Bounds);
    // Test if greater or equal (Reversed)
    vTemp2 = vcleq_f32(vTemp2,V);
    // Blend answers
    vTemp1 = vandq_u32(vTemp1,vTemp2);
    return vTemp1;
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);
    return vTemp1;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// As XMVectorInBounds, but additionally writes a comparison record to *pCR:
// XM_CRMASK_CR6BOUNDS when all four components are within +/-Bounds, else 0.
_Use_decl_annotations_
inline XMVECTOR XMVectorInBoundsR
(
    uint32_t* pCR,
    FXMVECTOR V,
    FXMVECTOR Bounds
)
{
    assert( pCR != NULL );
#if defined(_XM_NO_INTRINSICS_)

    uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ?
0xFFFFFFFFU : 0;

    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = vcleq_f32(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = vnegq_f32(Bounds);
    // Test if greater or equal (Reversed)
    vTemp2 = vcleq_f32(vTemp2,V);
    // Blend answers
    vTemp1 = vandq_u32(vTemp1,vTemp2);
    // Pack the four 32-bit lane masks down so one 32-bit read tests all lanes
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;
    return vTemp1;
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);

    uint32_t CR = 0;
    if (_mm_movemask_ps(vTemp1)==0xf) {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;
    return vTemp1;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where the component is NaN, 0 otherwise.
inline XMVECTOR XMVectorIsNaN
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[1] = XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[2] = XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[3] = XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test against itself. NaN is always not equal
    __n128 vTempNan = vceqq_f32( V, V );
    // Flip results
    return vmvnq_u32( vTempNan );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test against itself. NaN is always not equal
    return _mm_cmpneq_ps(V,V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Per-component mask: 0xFFFFFFFF where the component is +/- infinity,
// 0 otherwise.
inline XMVECTOR XMVectorIsInfinite
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Control;
    Control.vector4_u32[0] = XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[1] = XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[2] = XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[3] = XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Mask off the sign bit
    __n128 vTemp = vandq_u32(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = vceqq_f32(vTemp,g_XMInfinity);
    // If any are infinity, the signs are true.
    return vTemp;
#elif defined(_XM_SSE_INTRINSICS_)
    // Mask off the sign bit
    __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
    // If any are infinity, the signs are true.
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Rounding and clamping operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Component-wise minimum of V1 and V2.
inline XMVECTOR XMVectorMin
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
    Result.vector4_f32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
    Result.vector4_f32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
    Result.vector4_f32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vminq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_min_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise maximum of V1 and V2.
inline XMVECTOR XMVectorMax
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
    Result.vector4_f32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
    Result.vector4_f32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
    Result.vector4_f32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ?
V1.vector4_f32[3] : V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmaxq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_max_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Round each component to the nearest integer (the scalar/NEON paths bias by
// +/-0.5 toward the value's sign, then truncate toward zero).
inline XMVECTOR XMVectorRound
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();
    const XMVECTOR BiasPos = XMVectorReplicate(0.5f);
    const XMVECTOR BiasNeg = XMVectorReplicate(-0.5f);

    // Pick the bias matching each component's sign, add it, then truncate.
    XMVECTOR Bias = XMVectorLess(V, Zero);
    Bias = XMVectorSelect(BiasPos, BiasNeg, Bias);
    XMVECTOR Result = XMVectorAdd(V, Bias);
    Result = XMVectorTruncate(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vTest = vabsq_f32( V );
    vTest = vcltq_f32( vTest, g_XMNoFraction );

    __n128 Bias = vcltq_f32( V, vdupq_n_u32(0) );

    __n128 BiasPos = vdupq_n_f32( 0.5f );
    __n128 BiasNeg = vdupq_n_f32( -0.5f );
    Bias = vbslq_f32( Bias, BiasNeg, BiasPos );
    __n128 V0 = vaddq_f32( V, Bias );
    __n128 vInt = vcvtq_s32_f32( V0 );
    __n128 vResult = vcvtq_f32_s32( vInt );

    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    __m128i vInt = _mm_cvtps_epi32(V);
    // Convert back to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Truncate each component toward zero. NaN components become 0x7FC00000
// (a quiet NaN pattern); magnitudes >= 8388608 (2^23, already integral)
// pass through unchanged.
inline XMVECTOR XMVectorTruncate
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    uint32_t i;

    // Avoid C4701
    Result.vector4_f32[0] = 0.0f;

    for (i = 0; i < 4; i++)
    {
        if (XMISNAN(V.vector4_f32[i]))
        {
            Result.vector4_u32[i] = 0x7FC00000;
        }
        else if (fabsf(V.vector4_f32[i]) < 8388608.0f)
        {
            Result.vector4_f32[i] = (float)((int32_t)V.vector4_f32[i]);
        }
        else
        {
            Result.vector4_f32[i] = V.vector4_f32[i];
        }
    }
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vTest = vabsq_f32( V );
    vTest = vcltq_f32( vTest, g_XMNoFraction );

    __n128 vInt = vcvtq_s32_f32( V );
    __n128 vResult = vcvtq_f32_s32( vInt );

    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding with truncation
    __m128i vInt = _mm_cvttps_epi32(V);
    // Convert back to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Round each component down (toward negative infinity).
inline XMVECTOR XMVectorFloor
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR vResult = {
        floorf(V.vector4_f32[0]),
        floorf(V.vector4_f32[1]),
        floorf(V.vector4_f32[2]),
        floorf(V.vector4_f32[3])
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Subtract a bias just under 0.5 (0x3EFFFFA0 as float bits), then round.
    __n128 V0 = vsubq_f32( V, vdupq_n_u32(0x3EFFFFA0) );
    return XMVectorRound(V0);
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    XMVECTOR vResult = _mm_sub_ps(V,g_XMOneHalfMinusEpsilon);
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Convert back to floats
    vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Round each component up (toward positive infinity).
inline XMVECTOR XMVectorCeiling
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {
        ceilf(V.vector4_f32[0]),
        ceilf(V.vector4_f32[1]),
        ceilf(V.vector4_f32[2]),
        ceilf(V.vector4_f32[3])
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Add a bias just under 0.5 (0x3EFFFFA0 as float bits), then round.
    __n128 V0 = vaddq_f32( V, vdupq_n_u32(0x3EFFFFA0) );
    return XMVectorRound(V0);
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    XMVECTOR vResult = _mm_add_ps(V,g_XMOneHalfMinusEpsilon);
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Convert back to floats
    vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Clamp each component of V into [Min, Max]; asserts Min <= Max per component.
inline XMVECTOR XMVectorClamp
(
    FXMVECTOR V,
    FXMVECTOR Min,
    FXMVECTOR Max
)
{
    assert(XMVector4LessOrEqual(Min, Max));

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVectorMax(Min, V);
    Result = XMVectorMin(Max, Result);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR vResult;
    vResult = vmaxq_f32(Min,V);
    vResult = vminq_f32(vResult,Max);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult;
    vResult = _mm_max_ps(Min,V);
    vResult = _mm_min_ps(vResult,Max);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Clamp each component of V into [0, 1].
inline XMVECTOR XMVectorSaturate
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    return XMVectorClamp(V, Zero, g_XMOne.v);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Set <0 to 0
    XMVECTOR vResult = vmaxq_f32(V, vdupq_n_u32(0) );
    // Set>1 to 1
    return vminq_f32(vResult, vdupq_n_f32(1.0f) );
#elif defined(_XM_SSE_INTRINSICS_)
    // Set <0 to 0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Set>1 to 1
    return _mm_min_ps(vResult,g_XMOne);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Bitwise logical operations
//------------------------------------------------------------------------------

// Bitwise AND of the two vectors' 128-bit contents.
inline XMVECTOR XMVectorAndInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] & V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] & V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] & V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] & V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vandq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_and_ps(V1,V2);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Bitwise AND-NOT: V1 & ~V2, applied to the 128-bit contents.
inline XMVECTOR XMVectorAndCInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] & ~V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] & ~V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] & ~V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] & ~V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vbicq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_andnot_si128 computes ~arg0 & arg1, so the operands are swapped
    __m128i V = _mm_andnot_si128( _mm_castps_si128(V2), _mm_castps_si128(V1) );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Bitwise OR of the two vectors' 128-bit contents.
inline XMVECTOR XMVectorOrInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] | V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] | V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] | V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] | V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vorrq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Bitwise NOR: ~(V1 | V2), applied to the 128-bit contents.
inline XMVECTOR XMVectorNorInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = ~(V1.vector4_u32[0] | V2.vector4_u32[0]);
    Result.vector4_u32[1] = ~(V1.vector4_u32[1] | V2.vector4_u32[1]);
    Result.vector4_u32[2] = ~(V1.vector4_u32[2] | V2.vector4_u32[2]);
    Result.vector4_u32[3] = ~(V1.vector4_u32[3] | V2.vector4_u32[3]);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 Result = vorrq_u32(V1,V2);
    return vbicq_u32(g_XMNegOneMask, Result);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i Result;
    Result = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    // andnot with all-ones inverts the OR result
    Result = _mm_andnot_si128( Result,g_XMNegOneMask);
    return reinterpret_cast<__m128 *>(&Result)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Bitwise XOR of the two vectors' 128-bit contents.
inline XMVECTOR XMVectorXorInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_u32[0] = V1.vector4_u32[0] ^ V2.vector4_u32[0];
    Result.vector4_u32[1] = V1.vector4_u32[1] ^ V2.vector4_u32[1];
    Result.vector4_u32[2] = V1.vector4_u32[2] ^ V2.vector4_u32[2];
    Result.vector4_u32[3] = V1.vector4_u32[3] ^ V2.vector4_u32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return veorq_u32(V1,V2);
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i V = _mm_xor_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Computation operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------

// Negate each component of V.
inline XMVECTOR XMVectorNegate
(
    FXMVECTOR V
)
{
#if \
defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = -V.vector4_f32[0];
    Result.vector4_f32[1] = -V.vector4_f32[1];
    Result.vector4_f32[2] = -V.vector4_f32[2];
    Result.vector4_f32[3] = -V.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vnegq_f32(V);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR Z;

    Z = _mm_setzero_ps();

    return _mm_sub_ps( Z, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise sum V1 + V2.
inline XMVECTOR XMVectorAdd
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = V1.vector4_f32[0] + V2.vector4_f32[0];
    Result.vector4_f32[1] = V1.vector4_f32[1] + V2.vector4_f32[1];
    Result.vector4_f32[2] = V1.vector4_f32[2] + V2.vector4_f32[2];
    Result.vector4_f32[3] = V1.vector4_f32[3] + V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vaddq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_add_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise angle sum, wrapped back into [-Pi, Pi).
inline XMVECTOR XMVectorAddAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    // Add the given angles together. If the range of V1 is such
    // that -Pi <= V1 < Pi and the range of V2 is such that
    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
    // will be -Pi <= Result < Pi.
    XMVECTOR Result = XMVectorAdd(V1, V2);

    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);

    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);

    Result = XMVectorAdd(Result, Offset);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Adjust the angles
    __n128 vResult = vaddq_f32(V1,V2);
    // Less than Pi?
    __n128 vOffset = vcltq_f32(vResult,g_XMNegativePi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = vaddq_f32(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = vcgeq_f32(vResult,g_XMPi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = vsubq_f32(vResult,vOffset);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Adjust the angles
    XMVECTOR vResult = _mm_add_ps(V1,V2);
    // Less than Pi?
    XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = _mm_add_ps(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = _mm_cmpge_ps(vResult,g_XMPi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = _mm_sub_ps(vResult,vOffset);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise difference V1 - V2.
inline XMVECTOR XMVectorSubtract
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = V1.vector4_f32[0] - V2.vector4_f32[0];
    Result.vector4_f32[1] = V1.vector4_f32[1] - V2.vector4_f32[1];
    Result.vector4_f32[2] = V1.vector4_f32[2] - V2.vector4_f32[2];
    Result.vector4_f32[3] = V1.vector4_f32[3] - V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsubq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_sub_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise angle difference, wrapped back into [-Pi, Pi).
inline XMVECTOR XMVectorSubtractAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    // Subtract the given angles. If the range of V1 is such
    // that -Pi <= V1 < Pi and the range of V2 is such that
    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
    // will be -Pi <= Result < Pi.
XMVECTOR Result = XMVectorSubtract(V1, V2);

    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);

    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);

    Result = XMVectorAdd(Result, Offset);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Adjust the angles
    __n128 vResult = vsubq_f32(V1,V2);
    // Less than Pi?
    __n128 vOffset = vcltq_f32(vResult,g_XMNegativePi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = vaddq_f32(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = vcgeq_f32(vResult,g_XMPi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = vsubq_f32(vResult,vOffset);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Adjust the angles
    XMVECTOR vResult = _mm_sub_ps(V1,V2);
    // Less than Pi?
    XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = _mm_add_ps(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = _mm_cmpge_ps(vResult,g_XMPi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = _mm_sub_ps(vResult,vOffset);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise product V1 * V2.
inline XMVECTOR XMVectorMultiply
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result = {
        V1.vector4_f32[0] * V2.vector4_f32[0],
        V1.vector4_f32[1] * V2.vector4_f32[1],
        V1.vector4_f32[2] * V2.vector4_f32[2],
        V1.vector4_f32[3] * V2.vector4_f32[3]
    };
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmulq_f32( V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_mul_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise multiply-add: V1 * V2 + V3.
inline XMVECTOR XMVectorMultiplyAdd
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR V3
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {
        (V1.vector4_f32[0] * V2.vector4_f32[0]) + V3.vector4_f32[0],
        (V1.vector4_f32[1] * V2.vector4_f32[1]) + V3.vector4_f32[1],
        (V1.vector4_f32[2] * V2.vector4_f32[2]) + V3.vector4_f32[2],
        (V1.vector4_f32[3] * V2.vector4_f32[3]) + V3.vector4_f32[3]
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmlaq_f32( V3, V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = _mm_mul_ps( V1, V2 );
    return _mm_add_ps(vResult, V3 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise quotient V1 / V2 (the NEON path approximates via a
// refined reciprocal rather than a true divide).
inline XMVECTOR XMVectorDivide
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result.vector4_f32[0] = V1.vector4_f32[0] / V2.vector4_f32[0];
    Result.vector4_f32[1] = V1.vector4_f32[1] / V2.vector4_f32[1];
    Result.vector4_f32[2] = V1.vector4_f32[2] / V2.vector4_f32[2];
    Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3];
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 2 iterations of Newton-Raphson refinement of reciprocal
    __n128 Reciprocal = vrecpeq_f32(V2);
    __n128 S = vrecpsq_f32( Reciprocal, V2 );
    Reciprocal = vmulq_f32( S, Reciprocal );
    S = vrecpsq_f32( Reciprocal, V2 );
    Reciprocal = vmulq_f32( S, Reciprocal );
    return vmulq_f32( V1, Reciprocal );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_div_ps( V1, V2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Component-wise negative multiply-subtract: V3 - (V1 * V2).
inline XMVECTOR XMVectorNegativeMultiplySubtract
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR V3
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR vResult = {
        V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]),
        V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]),
        V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]),
        V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3])
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vmlsq_f32( V3, V1, V2 );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR R = _mm_mul_ps( V1, V2 );
    return _mm_sub_ps( V3, R );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

// Multiply each component of V by the scalar ScaleFactor.
inline XMVECTOR XMVectorScale
(
    FXMVECTOR V,
    float ScaleFactor
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {
V.vector4_f32[0] * ScaleFactor, 3019 V.vector4_f32[1] * ScaleFactor, 3020 V.vector4_f32[2] * ScaleFactor, 3021 V.vector4_f32[3] * ScaleFactor 3022 }; 3023 return vResult; 3024 3025#elif defined(_XM_ARM_NEON_INTRINSICS_) 3026 return vmulq_n_f32( V, ScaleFactor ); 3027#elif defined(_XM_SSE_INTRINSICS_) 3028 XMVECTOR vResult = _mm_set_ps1(ScaleFactor); 3029 return _mm_mul_ps(vResult,V); 3030#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 3031#endif // _XM_VMX128_INTRINSICS_ 3032} 3033 3034//------------------------------------------------------------------------------ 3035 3036inline XMVECTOR XMVectorReciprocalEst 3037( 3038 FXMVECTOR V 3039) 3040{ 3041#if defined(_XM_NO_INTRINSICS_) 3042 XMVECTOR Result; 3043 Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; 3044 Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; 3045 Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; 3046 Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; 3047 return Result; 3048#elif defined(_XM_ARM_NEON_INTRINSICS_) 3049 return vrecpeq_f32(V); 3050#elif defined(_XM_SSE_INTRINSICS_) 3051 return _mm_rcp_ps(V); 3052#else // _XM_VMX128_INTRINSICS_ 3053#endif // _XM_VMX128_INTRINSICS_ 3054} 3055 3056//------------------------------------------------------------------------------ 3057 3058inline XMVECTOR XMVectorReciprocal 3059( 3060 FXMVECTOR V 3061) 3062{ 3063#if defined(_XM_NO_INTRINSICS_) 3064 XMVECTOR Result; 3065 Result.vector4_f32[0] = 1.f / V.vector4_f32[0]; 3066 Result.vector4_f32[1] = 1.f / V.vector4_f32[1]; 3067 Result.vector4_f32[2] = 1.f / V.vector4_f32[2]; 3068 Result.vector4_f32[3] = 1.f / V.vector4_f32[3]; 3069 return Result; 3070#elif defined(_XM_ARM_NEON_INTRINSICS_) 3071 // 2 iterations of Newton-Raphson refinement 3072 __n128 Reciprocal = vrecpeq_f32(V); 3073 __n128 S = vrecpsq_f32( Reciprocal, V ); 3074 Reciprocal = vmulq_f32( S, Reciprocal ); 3075 S = vrecpsq_f32( Reciprocal, V ); 3076 return vmulq_f32( S, Reciprocal ); 3077#elif defined(_XM_SSE_INTRINSICS_) 3078 return _mm_div_ps(g_XMOne,V); 
3079#else // _XM_VMX128_INTRINSICS_ 3080#endif // _XM_VMX128_INTRINSICS_ 3081} 3082 3083//------------------------------------------------------------------------------ 3084// Return an estimated square root 3085inline XMVECTOR XMVectorSqrtEst 3086( 3087 FXMVECTOR V 3088) 3089{ 3090#if defined(_XM_NO_INTRINSICS_) 3091 XMVECTOR Result; 3092 Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); 3093 Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); 3094 Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); 3095 Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); 3096 return Result; 3097#elif defined(_XM_ARM_NEON_INTRINSICS_) 3098 // 1 iteration of Newton-Raphson refinment of sqrt 3099 __n128 S0 = vrsqrteq_f32(V); 3100 __n128 P0 = vmulq_f32( V, S0 ); 3101 __n128 R0 = vrsqrtsq_f32( P0, S0 ); 3102 __n128 S1 = vmulq_f32( S0, R0 ); 3103 3104 XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); 3105 XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); 3106 __n128 Result = vmulq_f32( V, S1 ); 3107 XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); 3108 return XMVectorSelect(V, Result, Select); 3109#elif defined(_XM_SSE_INTRINSICS_) 3110 return _mm_sqrt_ps(V); 3111#else // _XM_VMX128_INTRINSICS_ 3112#endif // _XM_VMX128_INTRINSICS_ 3113} 3114 3115//------------------------------------------------------------------------------ 3116 3117inline XMVECTOR XMVectorSqrt 3118( 3119 FXMVECTOR V 3120) 3121{ 3122#if defined(_XM_NO_INTRINSICS_) 3123 XMVECTOR Result; 3124 Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] ); 3125 Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] ); 3126 Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] ); 3127 Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] ); 3128 return Result; 3129#elif defined(_XM_ARM_NEON_INTRINSICS_) 3130 // 3 iterations of Newton-Raphson refinment of sqrt 3131 __n128 S0 = vrsqrteq_f32(V); 3132 __n128 P0 = vmulq_f32( V, S0 ); 3133 __n128 R0 = vrsqrtsq_f32( P0, S0 ); 3134 __n128 S1 = vmulq_f32( S0, R0 ); 3135 
__n128 P1 = vmulq_f32( V, S1 ); 3136 __n128 R1 = vrsqrtsq_f32( P1, S1 ); 3137 __n128 S2 = vmulq_f32( S1, R1 ); 3138 __n128 P2 = vmulq_f32( V, S2 ); 3139 __n128 R2 = vrsqrtsq_f32( P2, S2 ); 3140 __n128 S3 = vmulq_f32( S2, R2 ); 3141 3142 XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v); 3143 XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) ); 3144 __n128 Result = vmulq_f32( V, S3 ); 3145 XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero); 3146 return XMVectorSelect(V, Result, Select); 3147#elif defined(_XM_SSE_INTRINSICS_) 3148 return _mm_sqrt_ps(V); 3149#else // _XM_VMX128_INTRINSICS_ 3150#endif // _XM_VMX128_INTRINSICS_ 3151} 3152 3153//------------------------------------------------------------------------------ 3154 3155inline XMVECTOR XMVectorReciprocalSqrtEst 3156( 3157 FXMVECTOR V 3158) 3159{ 3160#if defined(_XM_NO_INTRINSICS_) 3161 XMVECTOR Result; 3162 Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); 3163 Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); 3164 Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); 3165 Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); 3166 return Result; 3167#elif defined(_XM_ARM_NEON_INTRINSICS_) 3168 return vrsqrteq_f32(V); 3169#elif defined(_XM_SSE_INTRINSICS_) 3170 return _mm_rsqrt_ps(V); 3171#else // _XM_VMX128_INTRINSICS_ 3172#endif // _XM_VMX128_INTRINSICS_ 3173} 3174 3175//------------------------------------------------------------------------------ 3176 3177inline XMVECTOR XMVectorReciprocalSqrt 3178( 3179 FXMVECTOR V 3180) 3181{ 3182#if defined(_XM_NO_INTRINSICS_) 3183 XMVECTOR Result; 3184 Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] ); 3185 Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] ); 3186 Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] ); 3187 Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] ); 3188 return Result; 3189#elif defined(_XM_ARM_NEON_INTRINSICS_) 3190 // 2 iterations of Newton-Raphson refinement of reciprocal 
3191 __n128 S0 = vrsqrteq_f32(V); 3192 3193 __n128 P0 = vmulq_f32( V, S0 ); 3194 __n128 R0 = vrsqrtsq_f32( P0, S0 ); 3195 3196 __n128 S1 = vmulq_f32( S0, R0 ); 3197 __n128 P1 = vmulq_f32( V, S1 ); 3198 __n128 R1 = vrsqrtsq_f32( P1, S1 ); 3199 3200 return vmulq_f32( S1, R1 ); 3201#elif defined(_XM_SSE_INTRINSICS_) 3202 XMVECTOR vResult = _mm_sqrt_ps(V); 3203 vResult = _mm_div_ps(g_XMOne,vResult); 3204 return vResult; 3205#else // _XM_VMX128_INTRINSICS_ 3206#endif // _XM_VMX128_INTRINSICS_ 3207} 3208 3209 3210//------------------------------------------------------------------------------ 3211 3212inline XMVECTOR XMVectorExp 3213( 3214 FXMVECTOR V 3215) 3216{ 3217#if defined(_XM_NO_INTRINSICS_) 3218 3219 XMVECTOR Result; 3220 Result.vector4_f32[0] = powf(2.0f, V.vector4_f32[0]); 3221 Result.vector4_f32[1] = powf(2.0f, V.vector4_f32[1]); 3222 Result.vector4_f32[2] = powf(2.0f, V.vector4_f32[2]); 3223 Result.vector4_f32[3] = powf(2.0f, V.vector4_f32[3]); 3224 return Result; 3225 3226#elif defined(_XM_ARM_NEON_INTRINSICS_) 3227 XMVECTORF32 vResult = { 3228 powf(2.0f,vgetq_lane_f32(V, 0)), 3229 powf(2.0f,vgetq_lane_f32(V, 1)), 3230 powf(2.0f,vgetq_lane_f32(V, 2)), 3231 powf(2.0f,vgetq_lane_f32(V, 3)) 3232 }; 3233 return vResult; 3234#elif defined(_XM_SSE_INTRINSICS_) 3235 __declspec(align(16)) float a[4]; 3236 _mm_store_ps( a, V ); 3237 XMVECTOR vResult = _mm_setr_ps( 3238 powf(2.0f,a[0]), 3239 powf(2.0f,a[1]), 3240 powf(2.0f,a[2]), 3241 powf(2.0f,a[3])); 3242 return vResult; 3243#else // _XM_VMX128_INTRINSICS_ 3244#endif // _XM_VMX128_INTRINSICS_ 3245} 3246 3247 3248//------------------------------------------------------------------------------ 3249 3250inline XMVECTOR XMVectorLog 3251( 3252 FXMVECTOR V 3253) 3254{ 3255#if defined(_XM_NO_INTRINSICS_) 3256 3257 const float fScale = 1.4426950f; // (1.0f / logf(2.0f)); 3258 3259 XMVECTOR Result; 3260 Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale; 3261 Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale; 3262 
Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale; 3263 Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale; 3264 return Result; 3265 3266#elif defined(_XM_ARM_NEON_INTRINSICS_) 3267 XMVECTOR vScale = vdupq_n_f32(1.0f / logf(2.0f)); 3268 XMVECTORF32 vResult = { 3269 logf(vgetq_lane_f32(V, 0)), 3270 logf(vgetq_lane_f32(V, 1)), 3271 logf(vgetq_lane_f32(V, 2)), 3272 logf(vgetq_lane_f32(V, 3)) 3273 }; 3274 return vmulq_f32( vResult, vScale ); 3275#elif defined(_XM_SSE_INTRINSICS_) 3276 __declspec(align(16)) float a[4]; 3277 _mm_store_ps( a, V ); 3278 XMVECTOR vScale = _mm_set_ps1(1.0f / logf(2.0f)); 3279 XMVECTOR vResult = _mm_setr_ps( 3280 logf(a[0]), 3281 logf(a[1]), 3282 logf(a[2]), 3283 logf(a[3])); 3284 vResult = _mm_mul_ps(vResult,vScale); 3285 return vResult; 3286#else // _XM_VMX128_INTRINSICS_ 3287#endif // _XM_VMX128_INTRINSICS_ 3288} 3289 3290 3291//------------------------------------------------------------------------------ 3292 3293inline XMVECTOR XMVectorPow 3294( 3295 FXMVECTOR V1, 3296 FXMVECTOR V2 3297) 3298{ 3299#if defined(_XM_NO_INTRINSICS_) 3300 3301 XMVECTOR Result; 3302 Result.vector4_f32[0] = powf(V1.vector4_f32[0], V2.vector4_f32[0]); 3303 Result.vector4_f32[1] = powf(V1.vector4_f32[1], V2.vector4_f32[1]); 3304 Result.vector4_f32[2] = powf(V1.vector4_f32[2], V2.vector4_f32[2]); 3305 Result.vector4_f32[3] = powf(V1.vector4_f32[3], V2.vector4_f32[3]); 3306 return Result; 3307 3308#elif defined(_XM_ARM_NEON_INTRINSICS_) 3309 XMVECTORF32 vResult = { 3310 powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)), 3311 powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)), 3312 powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)), 3313 powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3)) 3314 }; 3315 return vResult; 3316#elif defined(_XM_SSE_INTRINSICS_) 3317 __declspec(align(16)) float a[4]; 3318 __declspec(align(16)) float b[4]; 3319 _mm_store_ps( a, V1 ); 3320 _mm_store_ps( b, V2 ); 3321 XMVECTOR vResult = _mm_setr_ps( 3322 powf(a[0],b[0]), 3323 
powf(a[1],b[1]), 3324 powf(a[2],b[2]), 3325 powf(a[3],b[3])); 3326 return vResult; 3327#else // _XM_VMX128_INTRINSICS_ 3328#endif // _XM_VMX128_INTRINSICS_ 3329} 3330 3331//------------------------------------------------------------------------------ 3332 3333inline XMVECTOR XMVectorAbs 3334( 3335 FXMVECTOR V 3336) 3337{ 3338#if defined(_XM_NO_INTRINSICS_) 3339 XMVECTOR vResult = { 3340 fabsf(V.vector4_f32[0]), 3341 fabsf(V.vector4_f32[1]), 3342 fabsf(V.vector4_f32[2]), 3343 fabsf(V.vector4_f32[3]) 3344 }; 3345 return vResult; 3346 3347#elif defined(_XM_ARM_NEON_INTRINSICS_) 3348 return vabsq_f32( V ); 3349#elif defined(_XM_SSE_INTRINSICS_) 3350 XMVECTOR vResult = _mm_setzero_ps(); 3351 vResult = _mm_sub_ps(vResult,V); 3352 vResult = _mm_max_ps(vResult,V); 3353 return vResult; 3354#else // _XM_VMX128_INTRINSICS_ 3355#endif // _XM_VMX128_INTRINSICS_ 3356} 3357 3358//------------------------------------------------------------------------------ 3359 3360inline XMVECTOR XMVectorMod 3361( 3362 FXMVECTOR V1, 3363 FXMVECTOR V2 3364) 3365{ 3366 // V1 % V2 = V1 - V2 * truncate(V1 / V2) 3367 3368#if defined(_XM_NO_INTRINSICS_) 3369 3370 XMVECTOR Quotient = XMVectorDivide(V1, V2); 3371 Quotient = XMVectorTruncate(Quotient); 3372 XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1); 3373 return Result; 3374 3375#elif defined(_XM_ARM_NEON_INTRINSICS_) 3376 XMVECTOR vResult = XMVectorDivide(V1, V2); 3377 vResult = XMVectorTruncate(vResult); 3378 return vmlsq_f32( V1, vResult, V2 ); 3379#elif defined(_XM_SSE_INTRINSICS_) 3380 XMVECTOR vResult = _mm_div_ps(V1, V2); 3381 vResult = XMVectorTruncate(vResult); 3382 vResult = _mm_mul_ps(vResult,V2); 3383 vResult = _mm_sub_ps(V1,vResult); 3384 return vResult; 3385#else // _XM_VMX128_INTRINSICS_ 3386#endif // _XM_VMX128_INTRINSICS_ 3387} 3388 3389//------------------------------------------------------------------------------ 3390 3391inline XMVECTOR XMVectorModAngles 3392( 3393 FXMVECTOR Angles 3394) 3395{ 3396#if 
defined(_XM_NO_INTRINSICS_) 3397 3398 XMVECTOR V; 3399 XMVECTOR Result; 3400 3401 // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI 3402 V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v); 3403 V = XMVectorRound(V); 3404 Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles); 3405 return Result; 3406 3407#elif defined(_XM_ARM_NEON_INTRINSICS_) 3408 // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI 3409 XMVECTOR vResult = vmulq_f32(Angles,g_XMReciprocalTwoPi); 3410 // Use the inline function due to complexity for rounding 3411 vResult = XMVectorRound(vResult); 3412 return vmlsq_f32( Angles, vResult, g_XMTwoPi ); 3413#elif defined(_XM_SSE_INTRINSICS_) 3414 // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI 3415 XMVECTOR vResult = _mm_mul_ps(Angles,g_XMReciprocalTwoPi); 3416 // Use the inline function due to complexity for rounding 3417 vResult = XMVectorRound(vResult); 3418 vResult = _mm_mul_ps(vResult,g_XMTwoPi); 3419 vResult = _mm_sub_ps(Angles,vResult); 3420 return vResult; 3421#else // _XM_VMX128_INTRINSICS_ 3422#endif // _XM_VMX128_INTRINSICS_ 3423} 3424 3425//------------------------------------------------------------------------------ 3426 3427inline XMVECTOR XMVectorSin 3428( 3429 FXMVECTOR V 3430) 3431{ 3432 // 11-degree minimax approximation 3433 3434#if defined(_XM_NO_INTRINSICS_) 3435 XMVECTOR Result; 3436 Result.vector4_f32[0] = XMScalarSin( V.vector4_f32[0] ); 3437 Result.vector4_f32[1] = XMScalarSin( V.vector4_f32[1] ); 3438 Result.vector4_f32[2] = XMScalarSin( V.vector4_f32[2] ); 3439 Result.vector4_f32[3] = XMScalarSin( V.vector4_f32[3] ); 3440 return Result; 3441#elif defined(_XM_ARM_NEON_INTRINSICS_) 3442 // Force the value within the bounds of pi 3443 XMVECTOR x = XMVectorModAngles(V); 3444 3445 // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
3446 __n128 sign = vandq_u32(x, g_XMNegativeZero); 3447 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3448 __n128 absx = vabsq_f32( x ); 3449 __n128 rflx = vsubq_f32(c, x); 3450 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 3451 x = vbslq_f32( comp, x, rflx ); 3452 3453 __n128 x2 = vmulq_f32(x, x); 3454 3455 // Compute polynomial approximation 3456 const XMVECTOR SC1 = g_XMSinCoefficients1; 3457 XMVECTOR Result = vdupq_lane_f32(vget_low_f32(SC1), 0); 3458 3459 const XMVECTOR SC0 = g_XMSinCoefficients0; 3460 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); 3461 Result = vmlaq_f32(vConstants, Result, x2); 3462 3463 vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); 3464 Result = vmlaq_f32(vConstants, Result, x2); 3465 3466 vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); 3467 Result = vmlaq_f32(vConstants, Result, x2); 3468 3469 vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); 3470 Result = vmlaq_f32(vConstants, Result, x2); 3471 3472 Result = vmlaq_f32(g_XMOne, Result, x2); 3473 Result = vmulq_f32(Result, x); 3474 return Result; 3475#elif defined(_XM_SSE_INTRINSICS_) 3476 // Force the value within the bounds of pi 3477 XMVECTOR x = XMVectorModAngles(V); 3478 3479 // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
3480 __m128 sign = _mm_and_ps(x, g_XMNegativeZero); 3481 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3482 __m128 absx = _mm_andnot_ps(sign, x); // |x| 3483 __m128 rflx = _mm_sub_ps(c, x); 3484 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 3485 __m128 select0 = _mm_and_ps(comp, x); 3486 __m128 select1 = _mm_andnot_ps(comp, rflx); 3487 x = _mm_or_ps(select0, select1); 3488 3489 __m128 x2 = _mm_mul_ps(x, x); 3490 3491 // Compute polynomial approximation 3492 const XMVECTOR SC1 = g_XMSinCoefficients1; 3493 XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); 3494 __m128 Result = _mm_mul_ps(vConstants, x2); 3495 3496 const XMVECTOR SC0 = g_XMSinCoefficients0; 3497 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); 3498 Result = _mm_add_ps(Result, vConstants); 3499 Result = _mm_mul_ps(Result, x2); 3500 3501 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); 3502 Result = _mm_add_ps(Result, vConstants); 3503 Result = _mm_mul_ps(Result, x2); 3504 3505 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); 3506 Result = _mm_add_ps(Result, vConstants); 3507 Result = _mm_mul_ps(Result, x2); 3508 3509 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); 3510 Result = _mm_add_ps(Result, vConstants); 3511 Result = _mm_mul_ps(Result, x2); 3512 Result = _mm_add_ps(Result, g_XMOne); 3513 Result = _mm_mul_ps(Result, x); 3514 return Result; 3515#else // _XM_VMX128_INTRINSICS_ 3516#endif // _XM_VMX128_INTRINSICS_ 3517} 3518 3519//------------------------------------------------------------------------------ 3520 3521inline XMVECTOR XMVectorCos 3522( 3523 FXMVECTOR V 3524) 3525{ 3526 // 10-degree minimax approximation 3527 3528#if defined(_XM_NO_INTRINSICS_) 3529 XMVECTOR Result; 3530 Result.vector4_f32[0] = XMScalarCos( V.vector4_f32[0] ); 3531 Result.vector4_f32[1] = XMScalarCos( V.vector4_f32[1] ); 3532 Result.vector4_f32[2] = XMScalarCos( V.vector4_f32[2] ); 3533 Result.vector4_f32[3] = XMScalarCos( 
V.vector4_f32[3] ); 3534 return Result; 3535#elif defined(_XM_ARM_NEON_INTRINSICS_) 3536 // Map V to x in [-pi,pi]. 3537 XMVECTOR x = XMVectorModAngles(V); 3538 3539 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 3540 __n128 sign = vandq_u32(x, g_XMNegativeZero); 3541 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3542 __n128 absx = vabsq_f32( x ); 3543 __n128 rflx = vsubq_f32(c, x); 3544 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 3545 x = vbslq_f32( comp, x, rflx ); 3546 sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); 3547 3548 __n128 x2 = vmulq_f32(x, x); 3549 3550 // Compute polynomial approximation 3551 const XMVECTOR CC1 = g_XMCosCoefficients1; 3552 XMVECTOR Result = vdupq_lane_f32(vget_low_f32(CC1), 0); 3553 3554 const XMVECTOR CC0 = g_XMCosCoefficients0; 3555 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1); 3556 Result = vmlaq_f32(vConstants, Result, x2); 3557 3558 vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); 3559 Result = vmlaq_f32(vConstants, Result, x2); 3560 3561 vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); 3562 Result = vmlaq_f32(vConstants, Result, x2); 3563 3564 vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); 3565 Result = vmlaq_f32(vConstants, Result, x2); 3566 3567 Result = vmlaq_f32(g_XMOne, Result, x2); 3568 Result = vmulq_f32(Result, sign); 3569 return Result; 3570#elif defined(_XM_SSE_INTRINSICS_) 3571 // Map V to x in [-pi,pi]. 3572 XMVECTOR x = XMVectorModAngles(V); 3573 3574 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
3575 XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); 3576 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3577 __m128 absx = _mm_andnot_ps(sign, x); // |x| 3578 __m128 rflx = _mm_sub_ps(c, x); 3579 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 3580 __m128 select0 = _mm_and_ps(comp, x); 3581 __m128 select1 = _mm_andnot_ps(comp, rflx); 3582 x = _mm_or_ps(select0, select1); 3583 select0 = _mm_and_ps(comp, g_XMOne); 3584 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); 3585 sign = _mm_or_ps(select0, select1); 3586 3587 __m128 x2 = _mm_mul_ps(x, x); 3588 3589 // Compute polynomial approximation 3590 const XMVECTOR CC1 = g_XMCosCoefficients1; 3591 XMVECTOR vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); 3592 __m128 Result = _mm_mul_ps(vConstants, x2); 3593 3594 const XMVECTOR CC0 = g_XMCosCoefficients0; 3595 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); 3596 Result = _mm_add_ps(Result, vConstants); 3597 Result = _mm_mul_ps(Result, x2); 3598 3599 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); 3600 Result = _mm_add_ps(Result, vConstants); 3601 Result = _mm_mul_ps(Result, x2); 3602 3603 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); 3604 Result = _mm_add_ps(Result, vConstants); 3605 Result = _mm_mul_ps(Result, x2); 3606 3607 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); 3608 Result = _mm_add_ps(Result, vConstants); 3609 Result = _mm_mul_ps(Result, x2); 3610 Result = _mm_add_ps(Result, g_XMOne); 3611 Result = _mm_mul_ps(Result, sign); 3612 return Result; 3613#else // _XM_VMX128_INTRINSICS_ 3614#endif // _XM_VMX128_INTRINSICS_ 3615} 3616 3617//------------------------------------------------------------------------------ 3618 3619_Use_decl_annotations_ 3620inline void XMVectorSinCos 3621( 3622 XMVECTOR* pSin, 3623 XMVECTOR* pCos, 3624 FXMVECTOR V 3625) 3626{ 3627 assert(pSin != NULL); 3628 assert(pCos != NULL); 3629 3630 // 11/10-degree minimax approximation 3631 3632#if 
defined(_XM_NO_INTRINSICS_) 3633 XMVECTOR Sin; 3634 XMVECTOR Cos; 3635 3636 XMScalarSinCos(&Sin.vector4_f32[0], &Cos.vector4_f32[0], V.vector4_f32[0]); 3637 XMScalarSinCos(&Sin.vector4_f32[1], &Cos.vector4_f32[1], V.vector4_f32[1]); 3638 XMScalarSinCos(&Sin.vector4_f32[2], &Cos.vector4_f32[2], V.vector4_f32[2]); 3639 XMScalarSinCos(&Sin.vector4_f32[3], &Cos.vector4_f32[3], V.vector4_f32[3]); 3640 3641 *pSin = Sin; 3642 *pCos = Cos; 3643#elif defined(_XM_ARM_NEON_INTRINSICS_) 3644 // Force the value within the bounds of pi 3645 XMVECTOR x = XMVectorModAngles(V); 3646 3647 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 3648 __n128 sign = vandq_u32(x, g_XMNegativeZero); 3649 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3650 __n128 absx = vabsq_f32( x ); 3651 __n128 rflx = vsubq_f32(c, x); 3652 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 3653 x = vbslq_f32( comp, x, rflx ); 3654 sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); 3655 3656 __n128 x2 = vmulq_f32(x, x); 3657 3658 // Compute polynomial approximation for sine 3659 const XMVECTOR SC1 = g_XMSinCoefficients1; 3660 XMVECTOR Result = vdupq_lane_f32(vget_low_f32(SC1), 0); 3661 3662 const XMVECTOR SC0 = g_XMSinCoefficients0; 3663 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1); 3664 Result = vmlaq_f32(vConstants, Result, x2); 3665 3666 vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0); 3667 Result = vmlaq_f32(vConstants, Result, x2); 3668 3669 vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1); 3670 Result = vmlaq_f32(vConstants, Result, x2); 3671 3672 vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0); 3673 Result = vmlaq_f32(vConstants, Result, x2); 3674 3675 Result = vmlaq_f32(g_XMOne, Result, x2); 3676 *pSin = vmulq_f32(Result, x); 3677 3678 // Compute polynomial approximation for cosine 3679 const XMVECTOR CC1 = g_XMCosCoefficients1; 3680 Result = vdupq_lane_f32(vget_low_f32(CC1), 0); 3681 3682 const XMVECTOR CC0 = g_XMCosCoefficients0; 3683 vConstants = 
vdupq_lane_f32(vget_high_f32(CC0), 1); 3684 Result = vmlaq_f32(vConstants, Result, x2); 3685 3686 vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0); 3687 Result = vmlaq_f32(vConstants, Result, x2); 3688 3689 vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1); 3690 Result = vmlaq_f32(vConstants, Result, x2); 3691 3692 vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0); 3693 Result = vmlaq_f32(vConstants, Result, x2); 3694 3695 Result = vmlaq_f32(g_XMOne, Result, x2); 3696 *pCos = vmulq_f32(Result, sign); 3697#elif defined(_XM_SSE_INTRINSICS_) 3698 // Force the value within the bounds of pi 3699 XMVECTOR x = XMVectorModAngles(V); 3700 3701 // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 3702 XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); 3703 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 3704 __m128 absx = _mm_andnot_ps(sign, x); // |x| 3705 __m128 rflx = _mm_sub_ps(c, x); 3706 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 3707 __m128 select0 = _mm_and_ps(comp, x); 3708 __m128 select1 = _mm_andnot_ps(comp, rflx); 3709 x = _mm_or_ps(select0, select1); 3710 select0 = _mm_and_ps(comp, g_XMOne); 3711 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); 3712 sign = _mm_or_ps(select0, select1); 3713 3714 __m128 x2 = _mm_mul_ps(x, x); 3715 3716 // Compute polynomial approximation of sine 3717 const XMVECTOR SC1 = g_XMSinCoefficients1; 3718 XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) ); 3719 __m128 Result = _mm_mul_ps(vConstants, x2); 3720 3721 const XMVECTOR SC0 = g_XMSinCoefficients0; 3722 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) ); 3723 Result = _mm_add_ps(Result, vConstants); 3724 Result = _mm_mul_ps(Result, x2); 3725 3726 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) ); 3727 Result = _mm_add_ps(Result, vConstants); 3728 Result = _mm_mul_ps(Result, x2); 3729 3730 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) ); 3731 Result = _mm_add_ps(Result, vConstants); 3732 Result = 
_mm_mul_ps(Result, x2); 3733 3734 vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) ); 3735 Result = _mm_add_ps(Result, vConstants); 3736 Result = _mm_mul_ps(Result, x2); 3737 Result = _mm_add_ps(Result, g_XMOne); 3738 Result = _mm_mul_ps(Result, x); 3739 *pSin = Result; 3740 3741 // Compute polynomial approximation of cosine 3742 const XMVECTOR CC1 = g_XMCosCoefficients1; 3743 vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) ); 3744 Result = _mm_mul_ps(vConstants, x2); 3745 3746 const XMVECTOR CC0 = g_XMCosCoefficients0; 3747 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) ); 3748 Result = _mm_add_ps(Result, vConstants); 3749 Result = _mm_mul_ps(Result, x2); 3750 3751 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) ); 3752 Result = _mm_add_ps(Result, vConstants); 3753 Result = _mm_mul_ps(Result, x2); 3754 3755 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) ); 3756 Result = _mm_add_ps(Result, vConstants); 3757 Result = _mm_mul_ps(Result, x2); 3758 3759 vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) ); 3760 Result = _mm_add_ps(Result, vConstants); 3761 Result = _mm_mul_ps(Result, x2); 3762 Result = _mm_add_ps(Result, g_XMOne); 3763 Result = _mm_mul_ps(Result, sign); 3764 *pCos = Result; 3765#else // _XM_VMX128_INTRINSICS_ 3766#endif // _XM_VMX128_INTRINSICS_ 3767} 3768 3769//------------------------------------------------------------------------------ 3770 3771inline XMVECTOR XMVectorTan 3772( 3773 FXMVECTOR V 3774) 3775{ 3776 // Cody and Waite algorithm to compute tangent. 
3777 3778#if defined(_XM_NO_INTRINSICS_) 3779 XMVECTOR Result; 3780 Result.vector4_f32[0] = tanf( V.vector4_f32[0] ); 3781 Result.vector4_f32[1] = tanf( V.vector4_f32[1] ); 3782 Result.vector4_f32[2] = tanf( V.vector4_f32[2] ); 3783 Result.vector4_f32[3] = tanf( V.vector4_f32[3] ); 3784 return Result; 3785#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 3786 3787 static const XMVECTORF32 TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f}; 3788 static const XMVECTORF32 TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f}; 3789 static const XMVECTORF32 TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ }; 3790 static const XMVECTORU32 Mask = {0x1, 0x1, 0x1, 0x1}; 3791 3792 XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v); 3793 3794 XMVECTOR Zero = XMVectorZero(); 3795 3796 XMVECTOR C0 = XMVectorSplatX(TanConstants.v); 3797 XMVECTOR C1 = XMVectorSplatY(TanConstants.v); 3798 XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v); 3799 3800 XMVECTOR VA = XMVectorMultiply(V, TwoDivPi); 3801 3802 VA = XMVectorRound(VA); 3803 3804 XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V); 3805 3806 XMVECTOR VB = XMVectorAbs(VA); 3807 3808 VC = XMVectorNegativeMultiplySubtract(VA, C1, VC); 3809 3810#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 3811 VB = vcvtq_u32_f32( VB ); 3812#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 3813 reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB); 3814#else 3815 for (size_t i = 0; i < 4; i++) 3816 { 3817 VB.vector4_u32[i] = (uint32_t)VB.vector4_f32[i]; 3818 } 3819#endif 3820 3821 XMVECTOR VC2 = XMVectorMultiply(VC, VC); 3822 3823 XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v); 3824 XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v); 3825 XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v); 3826 XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v); 3827 XMVECTOR T5 = 
XMVectorSplatY(TanCoefficients1.v); 3828 XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v); 3829 XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v); 3830 XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v); 3831 3832 XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v); 3833 VBIsEven = XMVectorEqualInt(VBIsEven, Zero); 3834 3835 XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6); 3836 XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3); 3837 N = XMVectorMultiplyAdd(VC2, N, T5); 3838 D = XMVectorMultiplyAdd(VC2, D, T2); 3839 N = XMVectorMultiply(VC2, N); 3840 D = XMVectorMultiplyAdd(VC2, D, T1); 3841 N = XMVectorMultiplyAdd(VC, N, VC); 3842 XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon); 3843 D = XMVectorMultiplyAdd(VC2, D, T0); 3844 3845 N = XMVectorSelect(N, VC, VCNearZero); 3846 D = XMVectorSelect(D, g_XMOne.v, VCNearZero); 3847 3848 XMVECTOR R0 = XMVectorNegate(N); 3849 XMVECTOR R1 = XMVectorDivide(N,D); 3850 R0 = XMVectorDivide(D,R0); 3851 3852 XMVECTOR VIsZero = XMVectorEqual(V, Zero); 3853 3854 XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven); 3855 3856 Result = XMVectorSelect(Result, Zero, VIsZero); 3857 3858 return Result; 3859 3860#else // _XM_VMX128_INTRINSICS_ 3861#endif // _XM_VMX128_INTRINSICS_ 3862} 3863 3864//------------------------------------------------------------------------------ 3865 3866inline XMVECTOR XMVectorSinH 3867( 3868 FXMVECTOR V 3869) 3870{ 3871#if defined(_XM_NO_INTRINSICS_) 3872 XMVECTOR Result; 3873 Result.vector4_f32[0] = sinhf( V.vector4_f32[0] ); 3874 Result.vector4_f32[1] = sinhf( V.vector4_f32[1] ); 3875 Result.vector4_f32[2] = sinhf( V.vector4_f32[2] ); 3876 Result.vector4_f32[3] = sinhf( V.vector4_f32[3] ); 3877 return Result; 3878#elif defined(_XM_ARM_NEON_INTRINSICS_) 3879 static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) 3880 3881 XMVECTOR V1 = vmlaq_f32( g_XMNegativeOne.v, V, Scale.v ); 3882 XMVECTOR V2 = vmlsq_f32( g_XMNegativeOne.v, V, Scale.v ); 
3883 XMVECTOR E1 = XMVectorExp(V1); 3884 XMVECTOR E2 = XMVectorExp(V2); 3885 3886 return vsubq_f32(E1, E2); 3887#elif defined(_XM_SSE_INTRINSICS_) 3888 static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) 3889 3890 XMVECTOR V1 = _mm_mul_ps(V, Scale); 3891 V1 = _mm_add_ps(V1,g_XMNegativeOne); 3892 XMVECTOR V2 = _mm_mul_ps(V, Scale); 3893 V2 = _mm_sub_ps(g_XMNegativeOne,V2); 3894 XMVECTOR E1 = XMVectorExp(V1); 3895 XMVECTOR E2 = XMVectorExp(V2); 3896 3897 return _mm_sub_ps(E1, E2); 3898#else // _XM_VMX128_INTRINSICS_ 3899#endif // _XM_VMX128_INTRINSICS_ 3900} 3901 3902//------------------------------------------------------------------------------ 3903 3904inline XMVECTOR XMVectorCosH 3905( 3906 FXMVECTOR V 3907) 3908{ 3909#if defined(_XM_NO_INTRINSICS_) 3910 XMVECTOR Result; 3911 Result.vector4_f32[0] = coshf( V.vector4_f32[0] ); 3912 Result.vector4_f32[1] = coshf( V.vector4_f32[1] ); 3913 Result.vector4_f32[2] = coshf( V.vector4_f32[2] ); 3914 Result.vector4_f32[3] = coshf( V.vector4_f32[3] ); 3915 return Result; 3916#elif defined(_XM_ARM_NEON_INTRINSICS_) 3917 static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) 3918 3919 XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v); 3920 XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v); 3921 XMVECTOR E1 = XMVectorExp(V1); 3922 XMVECTOR E2 = XMVectorExp(V2); 3923 return vaddq_f32(E1, E2); 3924#elif defined(_XM_SSE_INTRINSICS_) 3925 static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f) 3926 3927 XMVECTOR V1 = _mm_mul_ps(V,Scale.v); 3928 V1 = _mm_add_ps(V1,g_XMNegativeOne.v); 3929 XMVECTOR V2 = _mm_mul_ps(V, Scale.v); 3930 V2 = _mm_sub_ps(g_XMNegativeOne.v,V2); 3931 XMVECTOR E1 = XMVectorExp(V1); 3932 XMVECTOR E2 = XMVectorExp(V2); 3933 return _mm_add_ps(E1, E2); 3934#else // 
_XM_VMX128_INTRINSICS_ 3935#endif // _XM_VMX128_INTRINSICS_ 3936} 3937 3938//------------------------------------------------------------------------------ 3939 3940inline XMVECTOR XMVectorTanH 3941( 3942 FXMVECTOR V 3943) 3944{ 3945#if defined(_XM_NO_INTRINSICS_) 3946 XMVECTOR Result; 3947 Result.vector4_f32[0] = tanhf( V.vector4_f32[0] ); 3948 Result.vector4_f32[1] = tanhf( V.vector4_f32[1] ); 3949 Result.vector4_f32[2] = tanhf( V.vector4_f32[2] ); 3950 Result.vector4_f32[3] = tanhf( V.vector4_f32[3] ); 3951 return Result; 3952#elif defined(_XM_ARM_NEON_INTRINSICS_) 3953 static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) 3954 3955 XMVECTOR E = vmulq_f32(V, Scale.v); 3956 E = XMVectorExp(E); 3957 E = vmlaq_f32( g_XMOneHalf.v, E, g_XMOneHalf.v ); 3958 E = XMVectorReciprocal(E); 3959 return vsubq_f32(g_XMOne.v, E); 3960#elif defined(_XM_SSE_INTRINSICS_) 3961 static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f) 3962 3963 XMVECTOR E = _mm_mul_ps(V, Scale.v); 3964 E = XMVectorExp(E); 3965 E = _mm_mul_ps(E,g_XMOneHalf.v); 3966 E = _mm_add_ps(E,g_XMOneHalf.v); 3967 E = _mm_div_ps(g_XMOne.v,E); 3968 return _mm_sub_ps(g_XMOne.v,E); 3969#else // _XM_VMX128_INTRINSICS_ 3970#endif // _XM_VMX128_INTRINSICS_ 3971} 3972 3973//------------------------------------------------------------------------------ 3974 3975inline XMVECTOR XMVectorASin 3976( 3977 FXMVECTOR V 3978) 3979{ 3980 // 7-degree minimax approximation 3981 3982#if defined(_XM_NO_INTRINSICS_) 3983 XMVECTOR Result; 3984 Result.vector4_f32[0] = XMScalarASin( V.vector4_f32[0] ); 3985 Result.vector4_f32[1] = XMScalarASin( V.vector4_f32[1] ); 3986 Result.vector4_f32[2] = XMScalarASin( V.vector4_f32[2] ); 3987 Result.vector4_f32[3] = XMScalarASin( V.vector4_f32[3] ); 3988 return Result; 3989#elif defined(_XM_ARM_NEON_INTRINSICS_) 3990 __n128 nonnegative = 
vcgeq_f32(V, g_XMZero); 3991 __n128 x = vabsq_f32(V); 3992 3993 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 3994 __n128 oneMValue = vsubq_f32(g_XMOne, x); 3995 __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); 3996 __n128 root = XMVectorSqrt(clampOneMValue); 3997 3998 // Compute polynomial approximation 3999 const XMVECTOR AC1 = g_XMArcCoefficients1; 4000 __n128 t0 = vdupq_lane_f32(vget_high_f32(AC1), 1); 4001 4002 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); 4003 t0 = vmlaq_f32( vConstants, t0, x ); 4004 4005 vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); 4006 t0 = vmlaq_f32( vConstants, t0, x ); 4007 4008 vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); 4009 t0 = vmlaq_f32( vConstants, t0, x ); 4010 4011 const XMVECTOR AC0 = g_XMArcCoefficients0; 4012 vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); 4013 t0 = vmlaq_f32( vConstants, t0, x ); 4014 4015 vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); 4016 t0 = vmlaq_f32( vConstants, t0, x ); 4017 4018 vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); 4019 t0 = vmlaq_f32( vConstants, t0, x ); 4020 4021 vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); 4022 t0 = vmlaq_f32( vConstants, t0, x ); 4023 t0 = vmulq_f32(t0, root); 4024 4025 __n128 t1 = vsubq_f32(g_XMPi, t0); 4026 t0 = vbslq_f32( nonnegative, t0, t1 ); 4027 t0 = vsubq_f32(g_XMHalfPi, t0); 4028 return t0; 4029#elif defined(_XM_SSE_INTRINSICS_) 4030 __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); 4031 __m128 mvalue = _mm_sub_ps(g_XMZero, V); 4032 __m128 x = _mm_max_ps(V, mvalue); // |V| 4033 4034 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
4035 __m128 oneMValue = _mm_sub_ps(g_XMOne, x); 4036 __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); 4037 __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) 4038 4039 // Compute polynomial approximation 4040 const XMVECTOR AC1 = g_XMArcCoefficients1; 4041 XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); 4042 __m128 t0 = _mm_mul_ps(vConstants, x); 4043 4044 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); 4045 t0 = _mm_add_ps(t0, vConstants); 4046 t0 = _mm_mul_ps(t0, x); 4047 4048 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); 4049 t0 = _mm_add_ps(t0, vConstants); 4050 t0 = _mm_mul_ps(t0, x); 4051 4052 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); 4053 t0 = _mm_add_ps(t0, vConstants); 4054 t0 = _mm_mul_ps(t0, x); 4055 4056 const XMVECTOR AC0 = g_XMArcCoefficients0; 4057 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); 4058 t0 = _mm_add_ps(t0, vConstants); 4059 t0 = _mm_mul_ps(t0, x); 4060 4061 vConstants = XM_PERMUTE_PS( AC0,_MM_SHUFFLE(2, 2, 2, 2) ); 4062 t0 = _mm_add_ps(t0, vConstants); 4063 t0 = _mm_mul_ps(t0, x); 4064 4065 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); 4066 t0 = _mm_add_ps(t0, vConstants); 4067 t0 = _mm_mul_ps(t0, x); 4068 4069 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); 4070 t0 = _mm_add_ps(t0, vConstants); 4071 t0 = _mm_mul_ps(t0, root); 4072 4073 __m128 t1 = _mm_sub_ps(g_XMPi, t0); 4074 t0 = _mm_and_ps(nonnegative, t0); 4075 t1 = _mm_andnot_ps(nonnegative, t1); 4076 t0 = _mm_or_ps(t0, t1); 4077 t0 = _mm_sub_ps(g_XMHalfPi, t0); 4078 return t0; 4079#else // _XM_VMX128_INTRINSICS_ 4080#endif // _XM_VMX128_INTRINSICS_ 4081} 4082 4083//------------------------------------------------------------------------------ 4084 4085inline XMVECTOR XMVectorACos 4086( 4087 FXMVECTOR V 4088) 4089{ 4090 // 7-degree minimax approximation 4091 4092#if defined(_XM_NO_INTRINSICS_) 4093 XMVECTOR Result; 4094 Result.vector4_f32[0] = XMScalarACos( 
V.vector4_f32[0] ); 4095 Result.vector4_f32[1] = XMScalarACos( V.vector4_f32[1] ); 4096 Result.vector4_f32[2] = XMScalarACos( V.vector4_f32[2] ); 4097 Result.vector4_f32[3] = XMScalarACos( V.vector4_f32[3] ); 4098 return Result; 4099#elif defined(_XM_ARM_NEON_INTRINSICS_) 4100 __n128 nonnegative = vcgeq_f32(V, g_XMZero); 4101 __n128 x = vabsq_f32(V); 4102 4103 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 4104 __n128 oneMValue = vsubq_f32(g_XMOne, x); 4105 __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); 4106 __n128 root = XMVectorSqrt(clampOneMValue); 4107 4108 // Compute polynomial approximation 4109 const XMVECTOR AC1 = g_XMArcCoefficients1; 4110 __n128 t0 = vdupq_lane_f32(vget_high_f32(AC1), 1); 4111 4112 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0); 4113 t0 = vmlaq_f32( vConstants, t0, x ); 4114 4115 vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1); 4116 t0 = vmlaq_f32( vConstants, t0, x ); 4117 4118 vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0); 4119 t0 = vmlaq_f32( vConstants, t0, x ); 4120 4121 const XMVECTOR AC0 = g_XMArcCoefficients0; 4122 vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1); 4123 t0 = vmlaq_f32( vConstants, t0, x ); 4124 4125 vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0); 4126 t0 = vmlaq_f32( vConstants, t0, x ); 4127 4128 vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1); 4129 t0 = vmlaq_f32( vConstants, t0, x ); 4130 4131 vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0); 4132 t0 = vmlaq_f32( vConstants, t0, x ); 4133 t0 = vmulq_f32(t0, root); 4134 4135 __n128 t1 = vsubq_f32(g_XMPi, t0); 4136 t0 = vbslq_f32( nonnegative, t0, t1 ); 4137 return t0; 4138#elif defined(_XM_SSE_INTRINSICS_) 4139 __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); 4140 __m128 mvalue = _mm_sub_ps(g_XMZero, V); 4141 __m128 x = _mm_max_ps(V, mvalue); // |V| 4142 4143 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
4144 __m128 oneMValue = _mm_sub_ps(g_XMOne, x); 4145 __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue); 4146 __m128 root = _mm_sqrt_ps(clampOneMValue); // sqrt(1-|V|) 4147 4148 // Compute polynomial approximation 4149 const XMVECTOR AC1 = g_XMArcCoefficients1; 4150 XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) ); 4151 __m128 t0 = _mm_mul_ps(vConstants, x); 4152 4153 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) ); 4154 t0 = _mm_add_ps(t0, vConstants); 4155 t0 = _mm_mul_ps(t0, x); 4156 4157 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) ); 4158 t0 = _mm_add_ps(t0, vConstants); 4159 t0 = _mm_mul_ps(t0, x); 4160 4161 vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) ); 4162 t0 = _mm_add_ps(t0, vConstants); 4163 t0 = _mm_mul_ps(t0, x); 4164 4165 const XMVECTOR AC0 = g_XMArcCoefficients0; 4166 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) ); 4167 t0 = _mm_add_ps(t0, vConstants); 4168 t0 = _mm_mul_ps(t0, x); 4169 4170 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(2, 2, 2, 2) ); 4171 t0 = _mm_add_ps(t0, vConstants); 4172 t0 = _mm_mul_ps(t0, x); 4173 4174 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) ); 4175 t0 = _mm_add_ps(t0, vConstants); 4176 t0 = _mm_mul_ps(t0, x); 4177 4178 vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) ); 4179 t0 = _mm_add_ps(t0, vConstants); 4180 t0 = _mm_mul_ps(t0, root); 4181 4182 __m128 t1 = _mm_sub_ps(g_XMPi, t0); 4183 t0 = _mm_and_ps(nonnegative, t0); 4184 t1 = _mm_andnot_ps(nonnegative, t1); 4185 t0 = _mm_or_ps(t0, t1); 4186 return t0; 4187#else // _XM_VMX128_INTRINSICS_ 4188#endif // _XM_VMX128_INTRINSICS_ 4189} 4190 4191//------------------------------------------------------------------------------ 4192 4193inline XMVECTOR XMVectorATan 4194( 4195 FXMVECTOR V 4196) 4197{ 4198 // 17-degree minimax approximation 4199 4200#if defined(_XM_NO_INTRINSICS_) 4201 XMVECTOR Result; 4202 Result.vector4_f32[0] = atanf( V.vector4_f32[0] ); 4203 Result.vector4_f32[1] = 
atanf( V.vector4_f32[1] ); 4204 Result.vector4_f32[2] = atanf( V.vector4_f32[2] ); 4205 Result.vector4_f32[3] = atanf( V.vector4_f32[3] ); 4206 return Result; 4207#elif defined(_XM_ARM_NEON_INTRINSICS_) 4208 __n128 absV = vabsq_f32(V); 4209 __n128 invV = XMVectorReciprocal(V); 4210 __n128 comp = vcgtq_f32(V, g_XMOne); 4211 __n128 sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne); 4212 comp = vcleq_f32(absV, g_XMOne); 4213 sign = vbslq_f32(comp, g_XMZero, sign); 4214 __n128 x = vbslq_f32(comp, V, invV); 4215 4216 __n128 x2 = vmulq_f32(x, x); 4217 4218 // Compute polynomial approximation 4219 const XMVECTOR TC1 = g_XMATanCoefficients1; 4220 __n128 Result = vdupq_lane_f32(vget_high_f32(TC1), 1); 4221 4222 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0); 4223 Result = vmlaq_f32( vConstants, Result, x2 ); 4224 4225 vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1); 4226 Result = vmlaq_f32( vConstants, Result, x2 ); 4227 4228 vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0); 4229 Result = vmlaq_f32( vConstants, Result, x2 ); 4230 4231 const XMVECTOR TC0 = g_XMATanCoefficients0; 4232 vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1); 4233 Result = vmlaq_f32( vConstants, Result, x2 ); 4234 4235 vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0); 4236 Result = vmlaq_f32( vConstants, Result, x2 ); 4237 4238 vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1); 4239 Result = vmlaq_f32( vConstants, Result, x2 ); 4240 4241 vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0); 4242 Result = vmlaq_f32( vConstants, Result, x2 ); 4243 4244 Result = vmlaq_f32( g_XMOne, Result, x2 ); 4245 Result = vmulq_f32( Result, x ); 4246 4247 __n128 result1 = vmulq_f32(sign, g_XMHalfPi); 4248 result1 = vsubq_f32(result1, Result); 4249 4250 comp = vceqq_f32(sign, g_XMZero); 4251 Result = vbslq_f32( comp, Result, result1 ); 4252 return Result; 4253#elif defined(_XM_SSE_INTRINSICS_) 4254 __m128 absV = XMVectorAbs(V); 4255 __m128 invV = _mm_div_ps(g_XMOne, V); 4256 __m128 comp = 
_mm_cmpgt_ps(V, g_XMOne); 4257 __m128 select0 = _mm_and_ps(comp, g_XMOne); 4258 __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); 4259 __m128 sign = _mm_or_ps(select0, select1); 4260 comp = _mm_cmple_ps(absV, g_XMOne); 4261 select0 = _mm_and_ps(comp, g_XMZero); 4262 select1 = _mm_andnot_ps(comp, sign); 4263 sign = _mm_or_ps(select0, select1); 4264 select0 = _mm_and_ps(comp, V); 4265 select1 = _mm_andnot_ps(comp, invV); 4266 __m128 x = _mm_or_ps(select0, select1); 4267 4268 __m128 x2 = _mm_mul_ps(x, x); 4269 4270 // Compute polynomial approximation 4271 const XMVECTOR TC1 = g_XMATanCoefficients1; 4272 XMVECTOR vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(3, 3, 3, 3) ); 4273 __m128 Result = _mm_mul_ps(vConstants, x2); 4274 4275 vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(2, 2, 2, 2) ); 4276 Result = _mm_add_ps(Result, vConstants); 4277 Result = _mm_mul_ps(Result, x2); 4278 4279 vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(1, 1, 1, 1) ); 4280 Result = _mm_add_ps(Result, vConstants); 4281 Result = _mm_mul_ps(Result, x2); 4282 4283 vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(0, 0, 0, 0) ); 4284 Result = _mm_add_ps(Result, vConstants); 4285 Result = _mm_mul_ps(Result, x2); 4286 4287 const XMVECTOR TC0 = g_XMATanCoefficients0; 4288 vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(3, 3, 3, 3) ); 4289 Result = _mm_add_ps(Result, vConstants); 4290 Result = _mm_mul_ps(Result, x2); 4291 4292 vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(2, 2, 2, 2) ); 4293 Result = _mm_add_ps(Result, vConstants); 4294 Result = _mm_mul_ps(Result, x2); 4295 4296 vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(1, 1, 1, 1) ); 4297 Result = _mm_add_ps(Result, vConstants); 4298 Result = _mm_mul_ps(Result, x2); 4299 4300 vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(0, 0, 0, 0) ); 4301 Result = _mm_add_ps(Result, vConstants); 4302 Result = _mm_mul_ps(Result, x2); 4303 Result = _mm_add_ps(Result, g_XMOne); 4304 Result = _mm_mul_ps(Result, x); 4305 __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi); 4306 
result1 = _mm_sub_ps(result1, Result); 4307 4308 comp = _mm_cmpeq_ps(sign, g_XMZero); 4309 select0 = _mm_and_ps(comp, Result); 4310 select1 = _mm_andnot_ps(comp, result1); 4311 Result = _mm_or_ps(select0, select1); 4312 return Result; 4313#else // _XM_VMX128_INTRINSICS_ 4314#endif // _XM_VMX128_INTRINSICS_ 4315} 4316 4317//------------------------------------------------------------------------------ 4318 4319inline XMVECTOR XMVectorATan2 4320( 4321 FXMVECTOR Y, 4322 FXMVECTOR X 4323) 4324{ 4325 // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions: 4326 4327 // Y == 0 and X is Negative -> Pi with the sign of Y 4328 // y == 0 and x is positive -> 0 with the sign of y 4329 // Y != 0 and X == 0 -> Pi / 2 with the sign of Y 4330 // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y) 4331 // X == -Infinity and Finite Y -> Pi with the sign of Y 4332 // X == +Infinity and Finite Y -> 0 with the sign of Y 4333 // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y 4334 // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y 4335 // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y 4336 4337 static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f}; 4338 4339 XMVECTOR Zero = XMVectorZero(); 4340 XMVECTOR ATanResultValid = XMVectorTrueInt(); 4341 4342 XMVECTOR Pi = XMVectorSplatX(ATan2Constants); 4343 XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants); 4344 XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants); 4345 XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants); 4346 4347 XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero); 4348 XMVECTOR XEqualsZero = XMVectorEqual(X, Zero); 4349 XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); 4350 XIsPositive = XMVectorEqualInt(XIsPositive, Zero); 4351 XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y); 4352 XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X); 4353 4354 XMVECTOR YSign = XMVectorAndInt(Y, 
g_XMNegativeZero.v); 4355 Pi = XMVectorOrInt(Pi, YSign); 4356 PiOverTwo = XMVectorOrInt(PiOverTwo, YSign); 4357 PiOverFour = XMVectorOrInt(PiOverFour, YSign); 4358 ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign); 4359 4360 XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive); 4361 XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero); 4362 XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero); 4363 XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive); 4364 XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity); 4365 XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity); 4366 ATanResultValid = XMVectorEqualInt(Result, ATanResultValid); 4367 4368 XMVECTOR V = XMVectorDivide(Y, X); 4369 4370 XMVECTOR R0 = XMVectorATan(V); 4371 4372 R1 = XMVectorSelect( Pi, Zero, XIsPositive ); 4373 R2 = XMVectorAdd(R0, R1); 4374 4375 return XMVectorSelect(Result, R2, ATanResultValid); 4376} 4377 4378//------------------------------------------------------------------------------ 4379 4380inline XMVECTOR XMVectorSinEst 4381( 4382 FXMVECTOR V 4383) 4384{ 4385 // 7-degree minimax approximation 4386 4387#if defined(_XM_NO_INTRINSICS_) 4388 XMVECTOR Result; 4389 Result.vector4_f32[0] = XMScalarSinEst( V.vector4_f32[0] ); 4390 Result.vector4_f32[1] = XMScalarSinEst( V.vector4_f32[1] ); 4391 Result.vector4_f32[2] = XMScalarSinEst( V.vector4_f32[2] ); 4392 Result.vector4_f32[3] = XMScalarSinEst( V.vector4_f32[3] ); 4393 return Result; 4394#elif defined(_XM_ARM_NEON_INTRINSICS_) 4395 // Force the value within the bounds of pi 4396 XMVECTOR x = XMVectorModAngles(V); 4397 4398 // Map in [-pi/2,pi/2] with sin(y) = sin(x). 
4399 __n128 sign = vandq_u32(x, g_XMNegativeZero); 4400 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4401 __n128 absx = vabsq_f32( x ); 4402 __n128 rflx = vsubq_f32(c, x); 4403 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 4404 x = vbslq_f32( comp, x, rflx ); 4405 4406 __n128 x2 = vmulq_f32(x, x); 4407 4408 // Compute polynomial approximation 4409 const XMVECTOR SEC = g_XMSinCoefficients1; 4410 XMVECTOR Result = vdupq_lane_f32(vget_high_f32(SEC), 1); 4411 4412 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); 4413 Result = vmlaq_f32(vConstants, Result, x2); 4414 4415 vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); 4416 Result = vmlaq_f32(vConstants, Result, x2); 4417 4418 Result = vmlaq_f32(g_XMOne, Result, x2); 4419 Result = vmulq_f32(Result, x); 4420 return Result; 4421#elif defined(_XM_SSE_INTRINSICS_) 4422 // Force the value within the bounds of pi 4423 XMVECTOR x = XMVectorModAngles(V); 4424 4425 // Map in [-pi/2,pi/2] with sin(y) = sin(x). 4426 __m128 sign = _mm_and_ps(x, g_XMNegativeZero); 4427 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4428 __m128 absx = _mm_andnot_ps(sign, x); // |x| 4429 __m128 rflx = _mm_sub_ps(c, x); 4430 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 4431 __m128 select0 = _mm_and_ps(comp, x); 4432 __m128 select1 = _mm_andnot_ps(comp, rflx); 4433 x = _mm_or_ps(select0, select1); 4434 4435 __m128 x2 = _mm_mul_ps(x, x); 4436 4437 // Compute polynomial approximation 4438 const XMVECTOR SEC = g_XMSinCoefficients1; 4439 XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); 4440 __m128 Result = _mm_mul_ps(vConstants, x2); 4441 4442 vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); 4443 Result = _mm_add_ps(Result, vConstants); 4444 Result = _mm_mul_ps(Result, x2); 4445 4446 vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); 4447 Result = _mm_add_ps(Result, vConstants); 4448 Result = _mm_mul_ps(Result, x2); 4449 4450 Result = _mm_add_ps(Result, 
g_XMOne); 4451 Result = _mm_mul_ps(Result, x); 4452 return Result; 4453#else // _XM_VMX128_INTRINSICS_ 4454#endif // _XM_VMX128_INTRINSICS_ 4455} 4456 4457//------------------------------------------------------------------------------ 4458 4459inline XMVECTOR XMVectorCosEst 4460( 4461 FXMVECTOR V 4462) 4463{ 4464 // 6-degree minimax approximation 4465 4466#if defined(_XM_NO_INTRINSICS_) 4467 XMVECTOR Result; 4468 Result.vector4_f32[0] = XMScalarCosEst( V.vector4_f32[0] ); 4469 Result.vector4_f32[1] = XMScalarCosEst( V.vector4_f32[1] ); 4470 Result.vector4_f32[2] = XMScalarCosEst( V.vector4_f32[2] ); 4471 Result.vector4_f32[3] = XMScalarCosEst( V.vector4_f32[3] ); 4472 return Result; 4473#elif defined(_XM_ARM_NEON_INTRINSICS_) 4474 // Map V to x in [-pi,pi]. 4475 XMVECTOR x = XMVectorModAngles(V); 4476 4477 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 4478 __n128 sign = vandq_u32(x, g_XMNegativeZero); 4479 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4480 __n128 absx = vabsq_f32( x ); 4481 __n128 rflx = vsubq_f32(c, x); 4482 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 4483 x = vbslq_f32( comp, x, rflx ); 4484 sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); 4485 4486 __n128 x2 = vmulq_f32(x, x); 4487 4488 // Compute polynomial approximation 4489 const XMVECTOR CEC = g_XMCosCoefficients1; 4490 XMVECTOR Result = vdupq_lane_f32(vget_high_f32(CEC), 1); 4491 4492 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); 4493 Result = vmlaq_f32(vConstants, Result, x2); 4494 4495 vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); 4496 Result = vmlaq_f32(vConstants, Result, x2); 4497 4498 Result = vmlaq_f32(g_XMOne, Result, x2); 4499 Result = vmulq_f32(Result, sign); 4500 return Result; 4501#elif defined(_XM_SSE_INTRINSICS_) 4502 // Map V to x in [-pi,pi]. 4503 XMVECTOR x = XMVectorModAngles(V); 4504 4505 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 
4506 XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); 4507 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4508 __m128 absx = _mm_andnot_ps(sign, x); // |x| 4509 __m128 rflx = _mm_sub_ps(c, x); 4510 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 4511 __m128 select0 = _mm_and_ps(comp, x); 4512 __m128 select1 = _mm_andnot_ps(comp, rflx); 4513 x = _mm_or_ps(select0, select1); 4514 select0 = _mm_and_ps(comp, g_XMOne); 4515 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); 4516 sign = _mm_or_ps(select0, select1); 4517 4518 __m128 x2 = _mm_mul_ps(x, x); 4519 4520 // Compute polynomial approximation 4521 const XMVECTOR CEC = g_XMCosCoefficients1; 4522 XMVECTOR vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); 4523 __m128 Result = _mm_mul_ps(vConstants, x2); 4524 4525 vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); 4526 Result = _mm_add_ps(Result, vConstants); 4527 Result = _mm_mul_ps(Result, x2); 4528 4529 vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); 4530 Result = _mm_add_ps(Result, vConstants); 4531 Result = _mm_mul_ps(Result, x2); 4532 4533 Result = _mm_add_ps(Result, g_XMOne); 4534 Result = _mm_mul_ps(Result, sign); 4535 return Result; 4536#else // _XM_VMX128_INTRINSICS_ 4537#endif // _XM_VMX128_INTRINSICS_ 4538} 4539 4540//------------------------------------------------------------------------------ 4541 4542_Use_decl_annotations_ 4543inline void XMVectorSinCosEst 4544( 4545 XMVECTOR* pSin, 4546 XMVECTOR* pCos, 4547 FXMVECTOR V 4548) 4549{ 4550 assert(pSin != NULL); 4551 assert(pCos != NULL); 4552 4553 // 7/6-degree minimax approximation 4554 4555#if defined(_XM_NO_INTRINSICS_) 4556 XMVECTOR Sin; 4557 XMVECTOR Cos; 4558 4559 XMScalarSinCosEst(&Sin.vector4_f32[0], &Cos.vector4_f32[0], V.vector4_f32[0]); 4560 XMScalarSinCosEst(&Sin.vector4_f32[1], &Cos.vector4_f32[1], V.vector4_f32[1]); 4561 XMScalarSinCosEst(&Sin.vector4_f32[2], &Cos.vector4_f32[2], V.vector4_f32[2]); 4562 XMScalarSinCosEst(&Sin.vector4_f32[3], 
&Cos.vector4_f32[3], V.vector4_f32[3]); 4563 4564 *pSin = Sin; 4565 *pCos = Cos; 4566#elif defined(_XM_ARM_NEON_INTRINSICS_) 4567 // Force the value within the bounds of pi 4568 XMVECTOR x = XMVectorModAngles(V); 4569 4570 // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x). 4571 __n128 sign = vandq_u32(x, g_XMNegativeZero); 4572 __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4573 __n128 absx = vabsq_f32( x ); 4574 __n128 rflx = vsubq_f32(c, x); 4575 __n128 comp = vcleq_f32(absx, g_XMHalfPi); 4576 x = vbslq_f32( comp, x, rflx ); 4577 sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne ); 4578 4579 __n128 x2 = vmulq_f32(x, x); 4580 4581 // Compute polynomial approximation for sine 4582 const XMVECTOR SEC = g_XMSinCoefficients1; 4583 XMVECTOR Result = vdupq_lane_f32(vget_high_f32(SEC), 1); 4584 4585 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0); 4586 Result = vmlaq_f32(vConstants, Result, x2); 4587 4588 vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1); 4589 Result = vmlaq_f32(vConstants, Result, x2); 4590 4591 Result = vmlaq_f32(g_XMOne, Result, x2); 4592 *pSin = vmulq_f32(Result, x); 4593 4594 // Compute polynomial approximation 4595 const XMVECTOR CEC = g_XMCosCoefficients1; 4596 Result = vdupq_lane_f32(vget_high_f32(CEC), 1); 4597 4598 vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0); 4599 Result = vmlaq_f32(vConstants, Result, x2); 4600 4601 vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1); 4602 Result = vmlaq_f32(vConstants, Result, x2); 4603 4604 Result = vmlaq_f32(g_XMOne, Result, x2); 4605 *pCos = vmulq_f32(Result, sign); 4606#elif defined(_XM_SSE_INTRINSICS_) 4607 // Force the value within the bounds of pi 4608 XMVECTOR x = XMVectorModAngles(V); 4609 4610 // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x). 
4611 XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero); 4612 __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0 4613 __m128 absx = _mm_andnot_ps(sign, x); // |x| 4614 __m128 rflx = _mm_sub_ps(c, x); 4615 __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi); 4616 __m128 select0 = _mm_and_ps(comp, x); 4617 __m128 select1 = _mm_andnot_ps(comp, rflx); 4618 x = _mm_or_ps(select0, select1); 4619 select0 = _mm_and_ps(comp, g_XMOne); 4620 select1 = _mm_andnot_ps(comp, g_XMNegativeOne); 4621 sign = _mm_or_ps(select0, select1); 4622 4623 __m128 x2 = _mm_mul_ps(x, x); 4624 4625 // Compute polynomial approximation for sine 4626 const XMVECTOR SEC = g_XMSinCoefficients1; 4627 XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) ); 4628 __m128 Result = _mm_mul_ps(vConstants, x2); 4629 4630 vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) ); 4631 Result = _mm_add_ps(Result, vConstants); 4632 Result = _mm_mul_ps(Result, x2); 4633 4634 vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) ); 4635 Result = _mm_add_ps(Result, vConstants); 4636 Result = _mm_mul_ps(Result, x2); 4637 4638 Result = _mm_add_ps(Result, g_XMOne); 4639 Result = _mm_mul_ps(Result, x); 4640 *pSin = Result; 4641 4642 // Compute polynomial approximation for cosine 4643 const XMVECTOR CEC = g_XMCosCoefficients1; 4644 vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) ); 4645 Result = _mm_mul_ps(vConstants, x2); 4646 4647 vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) ); 4648 Result = _mm_add_ps(Result, vConstants); 4649 Result = _mm_mul_ps(Result, x2); 4650 4651 vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) ); 4652 Result = _mm_add_ps(Result, vConstants); 4653 Result = _mm_mul_ps(Result, x2); 4654 4655 Result = _mm_add_ps(Result, g_XMOne); 4656 Result = _mm_mul_ps(Result, sign); 4657 *pCos = Result; 4658#else // _XM_VMX128_INTRINSICS_ 4659#endif // _XM_VMX128_INTRINSICS_ 4660} 4661 
4662//------------------------------------------------------------------------------ 4663 4664inline XMVECTOR XMVectorTanEst 4665( 4666 FXMVECTOR V 4667) 4668{ 4669 XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v); 4670 4671 XMVECTOR V1 = XMVectorMultiply(V, OneOverPi); 4672 V1 = XMVectorRound(V1); 4673 4674 V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V); 4675 4676 XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v); 4677 XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v); 4678 XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v); 4679 4680 XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2); 4681 XMVECTOR V2 = XMVectorMultiply(V1, V1); 4682 XMVECTOR V1T0 = XMVectorMultiply(V1, T0); 4683 XMVECTOR V1T1 = XMVectorMultiply(V1, T1); 4684 4685 XMVECTOR D = XMVectorReciprocalEst(V2T2); 4686 XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0); 4687 4688 return XMVectorMultiply(N, D); 4689} 4690 4691 4692//------------------------------------------------------------------------------ 4693 4694inline XMVECTOR XMVectorASinEst 4695( 4696 FXMVECTOR V 4697) 4698{ 4699 // 3-degree minimax approximation 4700 4701#if defined(_XM_NO_INTRINSICS_) 4702 XMVECTOR Result; 4703 Result.vector4_f32[0] = XMScalarASinEst( V.vector4_f32[0] ); 4704 Result.vector4_f32[1] = XMScalarASinEst( V.vector4_f32[1] ); 4705 Result.vector4_f32[2] = XMScalarASinEst( V.vector4_f32[2] ); 4706 Result.vector4_f32[3] = XMScalarASinEst( V.vector4_f32[3] ); 4707 return Result; 4708#elif defined(_XM_ARM_NEON_INTRINSICS_) 4709 __n128 nonnegative = vcgeq_f32(V, g_XMZero); 4710 __n128 x = vabsq_f32(V); 4711 4712 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
4713 __n128 oneMValue = vsubq_f32(g_XMOne, x); 4714 __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue); 4715 __n128 root = XMVectorSqrt(clampOneMValue); 4716 4717 // Compute polynomial approximation 4718 const XMVECTOR AEC = g_XMArcEstCoefficients; 4719 __n128 t0 = vdupq_lane_f32(vget_high_f32(AEC), 1); 4720 4721 XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0); 4722 t0 = vmlaq_f32( vConstants, t0, x ); 4723 4724 vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1); 4725 t0 = vmlaq_f32( vConstants, t0, x ); 4726 4727 vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0); 4728 t0 = vmlaq_f32( vConstants, t0, x ); 4729 t0 = vmulq_f32(t0, root); 4730 4731 __n128 t1 = vsubq_f32(g_XMPi, t0); 4732 t0 = vbslq_f32( nonnegative, t0, t1 ); 4733 t0 = vsubq_f32(g_XMHalfPi, t0); 4734 return t0; 4735#elif defined(_XM_SSE_INTRINSICS_) 4736 __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero); 4737 __m128 mvalue = _mm_sub_ps(g_XMZero, V); 4738 __m128 x = _mm_max_ps(V, mvalue); // |V| 4739 4740 // Compute (1-|V|), clamp to zero to avoid sqrt of negative number. 
// (continuation of the preceding function's SSE path — its opening lines are above this excerpt)
    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)

    // Compute polynomial approximation (Horner evaluation, one splatted
    // coefficient of g_XMArcEstCoefficients per step)
    const XMVECTOR AEC = g_XMArcEstCoefficients;
    XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 t0 = _mm_mul_ps(vConstants, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, root);

    // Blend per lane: keep t0 where V >= 0, use Pi - t0 where V < 0.
    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
    t0 = _mm_and_ps(nonnegative, t0);
    t1 = _mm_andnot_ps(nonnegative, t1);
    t0 = _mm_or_ps(t0, t1);
    t0 = _mm_sub_ps(g_XMHalfPi, t0);
    return t0;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Per-component arccosine estimate: 3-degree minimax polynomial scaled by
// sqrt(1-|V|), with (Pi - result) selected for negative components.
// NOTE(review): (1-|V|) is clamped to zero before the sqrt, so inputs
// slightly outside [-1,1] do not produce NaNs here.

inline XMVECTOR XMVectorACosEst
(
    FXMVECTOR V
)
{
    // 3-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result.vector4_f32[0] = XMScalarACosEst( V.vector4_f32[0] );
    Result.vector4_f32[1] = XMScalarACosEst( V.vector4_f32[1] );
    Result.vector4_f32[2] = XMScalarACosEst( V.vector4_f32[2] );
    Result.vector4_f32[3] = XMScalarACosEst( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 nonnegative = vcgeq_f32(V, g_XMZero);
    __n128 x = vabsq_f32(V);

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __n128 oneMValue = vsubq_f32(g_XMOne, x);
    __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
    __n128 root = XMVectorSqrt(clampOneMValue);

    // Compute polynomial approximation
    // (vmlaq_f32(a, b, c) == a + b*c — Horner steps with fused multiply-add)
    const XMVECTOR AEC = g_XMArcEstCoefficients;
    __n128 t0 = vdupq_lane_f32(vget_high_f32(AEC), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0);
    t0 = vmlaq_f32( vConstants, t0, x );
    t0 = vmulq_f32(t0, root);

    // Select t0 for V >= 0, Pi - t0 for V < 0.
    __n128 t1 = vsubq_f32(g_XMPi, t0);
    t0 = vbslq_f32( nonnegative, t0, t1 );
    return t0;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
    __m128 x = _mm_max_ps(V, mvalue);  // |V| = max(V, -V)

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)

    // Compute polynomial approximation
    const XMVECTOR AEC = g_XMArcEstCoefficients;
    XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 t0 = _mm_mul_ps(vConstants, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, root);

    // Mask blend: keep t0 where V >= 0, use Pi - t0 where V < 0.
    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
    t0 = _mm_and_ps(nonnegative, t0);
    t1 = _mm_andnot_ps(nonnegative, t1);
    t0 = _mm_or_ps(t0, t1);
    return t0;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

namespace Internal
{

// Scalar arctangent estimate used by the no-intrinsics path of
// XMVectorATanEst.  Range-reduces to |y| <= 1 and evaluates a 9-degree
// (odd) minimax polynomial; for |Value| > 1 it uses the identity
// atan(x) = sign * Pi/2 - atan(1/x).
inline float XMScalarATanEst
(
    float Value
)
{
    float y, sign;
    if (fabsf(Value) <= 1.0f)
    {
        y = Value;
        sign = 0.0f;     // sign == 0 marks "no Pi/2 fix-up needed"
    }
    else if (Value > 1.0f)
    {
        y = 1.0f / Value;
        sign = 1.0f;
    }
    else
    {
        y = 1.0f / Value;
        sign = -1.0f;
    }

    // 9-degree minimax approximation
    float y2 = y*y;
    float poly = ((((0.0208351f*y2-0.085133f)*y2+0.180141f)*y2-0.3302995f)*y2+0.999866f)*y;

    return (sign == 0.0f ? poly : sign*XM_PIDIV2 - poly);
}

}; // namespace Internal

//------------------------------------------------------------------------------
// Per-component arctangent estimate (9-degree minimax).  Arguments with
// |V| > 1 are reduced via the reciprocal and fixed up with +/- Pi/2.
// NOTE(review): the NEON/SSE paths use an *estimated* reciprocal / division
// for the reduction, so extreme magnitudes lose accuracy — expected for
// the Est variant.

inline XMVECTOR XMVectorATanEst
(
    FXMVECTOR V
)
{
    // 9-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result.vector4_f32[0] = Internal::XMScalarATanEst( V.vector4_f32[0] );
    Result.vector4_f32[1] = Internal::XMScalarATanEst( V.vector4_f32[1] );
    Result.vector4_f32[2] = Internal::XMScalarATanEst( V.vector4_f32[2] );
    Result.vector4_f32[3] = Internal::XMScalarATanEst( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 absV = vabsq_f32(V);
    __n128 invV = XMVectorReciprocalEst(V);
    // sign = +1 where V > 1, -1 where V < -1, 0 where |V| <= 1
    __n128 comp = vcgtq_f32(V, g_XMOne);
    __n128 sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne );
    comp = vcleq_f32(absV, g_XMOne);
    sign = vbslq_f32(comp, g_XMZero, sign );
    // x = V where |V| <= 1, else 1/V (estimate)
    __n128 x = vbslq_f32(comp, V, invV );

    __n128 x2 = vmulq_f32(x, x);

    // Compute polynomial approximation
    const XMVECTOR AEC = g_XMATanEstCoefficients1;
    __n128 Result = vdupq_lane_f32(vget_high_f32(AEC), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
    Result = vmlaq_f32( vConstants, Result, x2 );

    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
    Result = vmlaq_f32( vConstants, Result, x2 );

    vConstants = vdupq_lane_f32(vget_low_f32( AEC), 0);
    Result = vmlaq_f32( vConstants, Result, x2 );

    // ATanEstCoefficients0 is already splatted
    Result = vmlaq_f32( g_XMATanEstCoefficients0, Result, x2 );
    Result = vmulq_f32( Result, x );

    // Reduced lanes need sign*Pi/2 - poly
    float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi);
    result1 = vsubq_f32(result1, Result);

    comp = vceqq_f32(sign, g_XMZero);
    Result = vbslq_f32( comp, Result, result1 );
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 absV =
XMVectorAbs(V);
    __m128 invV = _mm_div_ps(g_XMOne, V);
    // sign = +1 where V > 1, -1 where V < -1, 0 where |V| <= 1
    // (and-with-mask / andnot / or implements a branchless select)
    __m128 comp = _mm_cmpgt_ps(V, g_XMOne);
    __m128 select0 = _mm_and_ps(comp, g_XMOne);
    __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
    __m128 sign = _mm_or_ps(select0, select1);
    comp = _mm_cmple_ps(absV, g_XMOne);
    select0 = _mm_and_ps(comp, g_XMZero);
    select1 = _mm_andnot_ps(comp, sign);
    sign = _mm_or_ps(select0, select1);
    // x = V where |V| <= 1, else 1/V
    select0 = _mm_and_ps(comp, V);
    select1 = _mm_andnot_ps(comp, invV);
    __m128 x = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation (Horner, in x^2 — the polynomial is odd)
    const XMVECTOR AEC = g_XMATanEstCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 Result = _mm_mul_ps(vConstants, x2);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    // ATanEstCoefficients0 is already splatted
    Result = _mm_add_ps(Result, g_XMATanEstCoefficients0);
    Result = _mm_mul_ps(Result, x);
    // Reduced lanes (sign != 0) need sign*Pi/2 - poly
    __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi);
    result1 = _mm_sub_ps(result1, Result);

    comp = _mm_cmpeq_ps(sign, g_XMZero);
    select0 = _mm_and_ps(comp, Result);
    select1 = _mm_andnot_ps(comp, result1);
    Result = _mm_or_ps(select0, select1);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Per-component atan2 estimate.  Special cases (Y==0, X==0, infinities) are
// resolved first with mask selects; remaining lanes fall through to
// XMVectorATanEst(Y/X) with a Pi correction for negative X.

inline XMVECTOR XMVectorATan2Est
(
    FXMVECTOR Y,
    FXMVECTOR X
)
{
    static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */};

    const XMVECTOR Zero = XMVectorZero();
    XMVECTOR ATanResultValid = XMVectorTrueInt();

    XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
    XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
    XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
    XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);

    XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
    XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
    // XIsPositive: sign bit of X clear (note: -0.0f counts as negative here)
    XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
    XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
    XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
    XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);

    // OR Y's sign bit into each special-case constant so results carry Y's sign
    XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
    Pi = XMVectorOrInt(Pi, YSign);
    PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
    PiOverFour = XMVectorOrInt(PiOverFour, YSign);
    ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);

    // Build the special-case result; lanes still equal to ATanResultValid
    // (all-ones) after the selects need the generic computation below.
    XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
    XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
    XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
    XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
    XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
    XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
    ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);

    // Generic path: atan(Y/X) via the estimated reciprocal, +/-Pi when X < 0
    XMVECTOR Reciprocal = XMVectorReciprocalEst(X);
    XMVECTOR V = XMVectorMultiply(Y, Reciprocal);
    XMVECTOR R0 = XMVectorATanEst(V);

    R1 = XMVectorSelect( Pi, Zero, XIsPositive );
    R2 = XMVectorAdd(R0, R1);

    Result = XMVectorSelect(Result, R2, ATanResultValid);

    return Result;
}

//------------------------------------------------------------------------------
// Linear interpolation: V0 + t * (V1 - V0), scalar t applied to all lanes.

inline XMVECTOR XMVectorLerp
(
    FXMVECTOR V0,
    FXMVECTOR
V1,
    float t
)
{
    // V0 + t * (V1 - V0)

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Scale = XMVectorReplicate(t);
    XMVECTOR Length = XMVectorSubtract(V1, V0);
    return XMVectorMultiplyAdd(Length, Scale, V0);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR L = vsubq_f32( V1, V0 );
    return vmlaq_n_f32( V0, L, t );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR L = _mm_sub_ps( V1, V0 );
    XMVECTOR S = _mm_set_ps1( t );
    XMVECTOR Result = _mm_mul_ps( L, S );
    return _mm_add_ps( Result, V0 );
// NOTE(review): empty fallback branch; elsewhere in this file the same spot
// reads `#else // _XM_VMX128_INTRINSICS_` — confirm this macro name is intended.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Linear interpolation with a per-lane parameter: V0 + T * (V1 - V0).

inline XMVECTOR XMVectorLerpV
(
    FXMVECTOR V0,
    FXMVECTOR V1,
    FXMVECTOR T
)
{
    // V0 + T * (V1 - V0)

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Length = XMVectorSubtract(V1, V0);
    return XMVectorMultiplyAdd(Length, T, V0);

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR L = vsubq_f32( V1, V0 );
    return vmlaq_f32( V0, L, T );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR Length = _mm_sub_ps( V1, V0 );
    XMVECTOR Result = _mm_mul_ps( Length, T );
    return _mm_add_ps( Result, V0 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Hermite spline interpolation between Position0/Position1 with tangents
// Tangent0/Tangent1 at scalar parameter t (the four cubic basis functions
// are spelled out in the comment below).

inline XMVECTOR XMVectorHermite
(
    FXMVECTOR Position0,
    FXMVECTOR Tangent0,
    FXMVECTOR Position1,
    GXMVECTOR Tangent1,
    float t
)
{
    // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
    //          (t^3 - 2 * t^2 + t) * Tangent0 +
    //          (-2 * t^3 + 3 * t^2) * Position1 +
    //          (t^3 - t^2) * Tangent1

#if defined(_XM_NO_INTRINSICS_)

    float t2 = t * t;
    float t3 = t * t2;

    XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f);
    XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t);
    XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2);
    XMVECTOR T1 = XMVectorReplicate(t3 - t2);

    XMVECTOR Result = XMVectorMultiply(P0, Position0);
    Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
    Result = XMVectorMultiplyAdd(P1, Position1, Result);
    Result = XMVectorMultiplyAdd(T1, Tangent1, Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float t2 = t * t;
    float t3 = t * t2;

    XMVECTOR P0 = vdupq_n_f32(2.0f * t3 - 3.0f * t2 + 1.0f);
    XMVECTOR T0 = vdupq_n_f32(t3 - 2.0f * t2 + t);
    XMVECTOR P1 = vdupq_n_f32(-2.0f * t3 + 3.0f * t2);
    XMVECTOR T1 = vdupq_n_f32(t3 - t2);

    XMVECTOR vResult = vmulq_f32(P0, Position0);
    vResult = vmlaq_f32( vResult, T0, Tangent0 );
    vResult = vmlaq_f32( vResult, P1, Position1 );
    vResult = vmlaq_f32( vResult, T1, Tangent1 );
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    float t2 = t * t;
    float t3 = t * t2;

    XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f);
    XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t);
    XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2);
    XMVECTOR T1 = _mm_set_ps1(t3 - t2);

    XMVECTOR vResult = _mm_mul_ps(P0, Position0);
    XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0);
    vResult = _mm_add_ps(vResult,vTemp);
    vTemp = _mm_mul_ps(P1, Position1);
    vResult = _mm_add_ps(vResult,vTemp);
    vTemp = _mm_mul_ps(T1, Tangent1);
    vResult = _mm_add_ps(vResult,vTemp);
    return vResult;
// NOTE(review): empty fallback branch, as in XMVectorLerp above — confirm macro.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Hermite spline interpolation with a per-lane parameter T: component i of T
// supplies the t used for basis function i (see the scalar branch below).

inline XMVECTOR XMVectorHermiteV
(
    FXMVECTOR Position0,
    FXMVECTOR Tangent0,
    FXMVECTOR Position1,
    GXMVECTOR Tangent1,
    CXMVECTOR T
)
{
    // Result = (2 * t^3 - 3 *
// t^2 + 1) * Position0 +
//          (t^3 - 2 * t^2 + t) * Tangent0 +
//          (-2 * t^3 + 3 * t^2) * Position1 +
//          (t^3 - t^2) * Tangent1

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR T2 = XMVectorMultiply(T, T);
    XMVECTOR T3 = XMVectorMultiply(T , T2);

    // Note: each basis coefficient is built from the matching lane of T
    XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f);
    XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]);
    XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]);
    XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]);

    XMVECTOR Result = XMVectorMultiply(P0, Position0);
    Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
    Result = XMVectorMultiplyAdd(P1, Position1, Result);
    Result = XMVectorMultiplyAdd(T1, Tangent1, Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // The four basis polynomials are evaluated at once: lane i of T3 ends up
    // holding basis coefficient i after the constant multiplies/fix-ups below.
    static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f};
    static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f};

    XMVECTOR T2 = vmulq_f32(T,T);
    XMVECTOR T3 = vmulq_f32(T,T2);
    // Mul by the constants against t^2
    T2 = vmulq_f32(T2,CatMulT2);
    // Mul by the constants against t^3
    T3 = vmlaq_f32(T2, T3, CatMulT3 );
    // T3 now has the pre-result.
    // I need to add t.y only
    T2 = vandq_u32(T,g_XMMaskY);
    T3 = vaddq_f32(T3,T2);
    // Add 1.0f to x
    T3 = vaddq_f32(T3,g_XMIdentityR0);
    // Now, I have the constants created
    // Mul the x constant to Position0
    XMVECTOR vResult = vdupq_lane_f32( vget_low_f32( T3 ), 0 ); // T3[0]
    vResult = vmulq_f32(vResult,Position0);
    // Mul the y constant to Tangent0
    T2 = vdupq_lane_f32( vget_low_f32( T3 ), 1 ); // T3[1]
    vResult = vmlaq_f32(vResult, T2, Tangent0 );
    // Mul the z constant to Position1
    T2 = vdupq_lane_f32( vget_high_f32( T3 ), 0 ); // T3[2]
    vResult = vmlaq_f32(vResult, T2, Position1 );
    // Mul the w constant to Tangent1
    T3 = vdupq_lane_f32( vget_high_f32( T3 ), 1 ); // T3[3]
    vResult = vmlaq_f32(vResult, T3, Tangent1 );
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f};
    static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f};

    XMVECTOR T2 = _mm_mul_ps(T,T);
    XMVECTOR T3 = _mm_mul_ps(T,T2);
    // Mul by the constants against t^2
    T2 = _mm_mul_ps(T2,CatMulT2);
    // Mul by the constants against t^3
    T3 = _mm_mul_ps(T3,CatMulT3);
    // T3 now has the pre-result.
    T3 = _mm_add_ps(T3,T2);
    // I need to add t.y only
    T2 = _mm_and_ps(T,g_XMMaskY);
    T3 = _mm_add_ps(T3,T2);
    // Add 1.0f to x
    T3 = _mm_add_ps(T3,g_XMIdentityR0);
    // Now, I have the constants created
    // Mul the x constant to Position0
    XMVECTOR vResult = XM_PERMUTE_PS(T3,_MM_SHUFFLE(0,0,0,0));
    vResult = _mm_mul_ps(vResult,Position0);
    // Mul the y constant to Tangent0
    T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(1,1,1,1));
    T2 = _mm_mul_ps(T2,Tangent0);
    vResult = _mm_add_ps(vResult,T2);
    // Mul the z constant to Position1
    T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(2,2,2,2));
    T2 = _mm_mul_ps(T2,Position1);
    vResult = _mm_add_ps(vResult,T2);
    // Mul the w constant to Tangent1
    T3 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(3,3,3,3));
    T3 = _mm_mul_ps(T3,Tangent1);
    vResult = _mm_add_ps(vResult,T3);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Catmull-Rom spline interpolation through Position1..Position2 at scalar t,
// with Position0/Position3 as the outer control points.

inline XMVECTOR XMVectorCatmullRom
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR Position3,
    float t
)
{
    // Result = ((-t^3 + 2 * t^2 - t) * Position0 +
    //           (3 * t^3 - 5 * t^2 + 2) * Position1 +
    //           (-3 * t^3 + 4 * t^2 + t) * Position2 +
    //           (t^3 - t^2) * Position3) * 0.5

#if defined(_XM_NO_INTRINSICS_)

    float t2 = t * t;
    float t3 = t * t2;

    // The 0.5 factor is folded into each basis coefficient
    XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f);
    XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
    XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
    XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f);

    XMVECTOR Result = XMVectorMultiply(P0, Position0);
    Result = XMVectorMultiplyAdd(P1, Position1, Result);
    Result = XMVectorMultiplyAdd(P2, Position2, Result);
    Result =
XMVectorMultiplyAdd(P3, Position3, Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    float t2 = t * t;
    float t3 = t * t2;

    XMVECTOR P0 = vdupq_n_f32((-t3 + 2.0f * t2 - t) * 0.5f);
    XMVECTOR P1 = vdupq_n_f32((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
    XMVECTOR P2 = vdupq_n_f32((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
    XMVECTOR P3 = vdupq_n_f32((t3 - t2) * 0.5f);

    // Pairwise multiply-accumulate, then a final add
    P1 = vmulq_f32(P1, Position1);
    P0 = vmlaq_f32(P1, P0, Position0);
    P3 = vmulq_f32(P3, Position3);
    P2 = vmlaq_f32(P3, P2, Position2);
    P0 = vaddq_f32(P0,P2);
    return P0;
#elif defined(_XM_SSE_INTRINSICS_)
    float t2 = t * t;
    float t3 = t * t2;

    XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f);
    XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
    XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
    XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f);

    P0 = _mm_mul_ps(P0, Position0);
    P1 = _mm_mul_ps(P1, Position1);
    P2 = _mm_mul_ps(P2, Position2);
    P3 = _mm_mul_ps(P3, Position3);
    P0 = _mm_add_ps(P0,P1);
    P2 = _mm_add_ps(P2,P3);
    P0 = _mm_add_ps(P0,P2);
    return P0;
// NOTE(review): empty fallback branch, as in XMVectorLerp — confirm macro name.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Catmull-Rom interpolation with a per-lane parameter: component i of T is
// the t used for component i of the result (see the scalar branch).

inline XMVECTOR XMVectorCatmullRomV
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR Position3,
    CXMVECTOR T
)
{
#if defined(_XM_NO_INTRINSICS_)
    float fx = T.vector4_f32[0];
    float fy = T.vector4_f32[1];
    float fz = T.vector4_f32[2];
    float fw = T.vector4_f32[3];
    XMVECTOR vResult = {
        0.5f*((-fx*fx*fx+2*fx*fx-fx)*Position0.vector4_f32[0]+
        (3*fx*fx*fx-5*fx*fx+2)*Position1.vector4_f32[0]+
        (-3*fx*fx*fx+4*fx*fx+fx)*Position2.vector4_f32[0]+
        (fx*fx*fx-fx*fx)*Position3.vector4_f32[0]),
        0.5f*((-fy*fy*fy+2*fy*fy-fy)*Position0.vector4_f32[1]+
        (3*fy*fy*fy-5*fy*fy+2)*Position1.vector4_f32[1]+
        (-3*fy*fy*fy+4*fy*fy+fy)*Position2.vector4_f32[1]+
        (fy*fy*fy-fy*fy)*Position3.vector4_f32[1]),
        0.5f*((-fz*fz*fz+2*fz*fz-fz)*Position0.vector4_f32[2]+
        (3*fz*fz*fz-5*fz*fz+2)*Position1.vector4_f32[2]+
        (-3*fz*fz*fz+4*fz*fz+fz)*Position2.vector4_f32[2]+
        (fz*fz*fz-fz*fz)*Position3.vector4_f32[2]),
        0.5f*((-fw*fw*fw+2*fw*fw-fw)*Position0.vector4_f32[3]+
        (3*fw*fw*fw-5*fw*fw+2)*Position1.vector4_f32[3]+
        (-3*fw*fw*fw+4*fw*fw+fw)*Position2.vector4_f32[3]+
        (fw*fw*fw-fw*fw)*Position3.vector4_f32[3])
    };
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f};
    static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f};
    static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f};
    static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f};
    // Cache T^2 and T^3
    XMVECTOR T2 = vmulq_f32(T,T);
    XMVECTOR T3 = vmulq_f32(T,T2);
    // Perform the Position0 term: (2*t^2 - t - t^3)
    XMVECTOR vResult = vaddq_f32(T2,T2);
    vResult = vsubq_f32(vResult,T);
    vResult = vsubq_f32(vResult,T3);
    vResult = vmulq_f32(vResult,Position0);
    // Perform the Position1 term and add: (3*t^3 - 5*t^2 + 2)
    XMVECTOR vTemp = vmulq_f32(T3,Catmul3);
    vTemp = vmlsq_f32(vTemp, T2, Catmul5);
    vTemp = vaddq_f32(vTemp,Catmul2);
    vResult = vmlaq_f32(vResult, vTemp, Position1);
    // Perform the Position2 term and add: (4*t^2 - 3*t^3 + t)
    vTemp = vmulq_f32(T2,Catmul4);
    vTemp = vmlsq_f32(vTemp, T3, Catmul3);
    vTemp = vaddq_f32(vTemp,T);
    vResult = vmlaq_f32(vResult, vTemp, Position2);
    // Position3 is the last term: (t^3 - t^2)
    T3 = vsubq_f32(T3,T2);
    vResult = vmlaq_f32(vResult, T3, Position3);
    // Multiply by 0.5f and exit
    vResult = vmulq_f32(vResult,g_XMOneHalf);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f};
    static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f};
    static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f};
    static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f};
    // Cache T^2 and T^3
    XMVECTOR T2 = _mm_mul_ps(T,T);
    XMVECTOR T3 = _mm_mul_ps(T,T2);
    // Perform the Position0 term
    XMVECTOR vResult = _mm_add_ps(T2,T2);
    vResult = _mm_sub_ps(vResult,T);
    vResult = _mm_sub_ps(vResult,T3);
    vResult = _mm_mul_ps(vResult,Position0);
    // Perform the Position1 term and add
    XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3);
    XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5);
    vTemp = _mm_sub_ps(vTemp,vTemp2);
    vTemp = _mm_add_ps(vTemp,Catmul2);
    vTemp = _mm_mul_ps(vTemp,Position1);
    vResult = _mm_add_ps(vResult,vTemp);
    // Perform the Position2 term and add
    vTemp = _mm_mul_ps(T2,Catmul4);
    vTemp2 = _mm_mul_ps(T3,Catmul3);
    vTemp = _mm_sub_ps(vTemp,vTemp2);
    vTemp = _mm_add_ps(vTemp,T);
    vTemp = _mm_mul_ps(vTemp,Position2);
    vResult = _mm_add_ps(vResult,vTemp);
    // Position3 is the last term
    T3 = _mm_sub_ps(T3,T2);
    T3 = _mm_mul_ps(T3,Position3);
    vResult = _mm_add_ps(vResult,T3);
    // Multiply by 0.5f and exit
    vResult = _mm_mul_ps(vResult,g_XMOneHalf);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Barycentric combination of three points with scalar weights f and g.

inline XMVECTOR XMVectorBaryCentric
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    float f,
    float g
)
{
    // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0)

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR P10 = XMVectorSubtract(Position1, Position0);
    XMVECTOR ScaleF = XMVectorReplicate(f);

    XMVECTOR P20 =
XMVectorSubtract(Position2, Position0);
    XMVECTOR ScaleG = XMVectorReplicate(g);

    XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0);
    Result = XMVectorMultiplyAdd(P20, ScaleG, Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR R1 = vsubq_f32(Position1,Position0);
    XMVECTOR SF = vdupq_n_f32(f);
    XMVECTOR R2 = vsubq_f32(Position2,Position0);
    XMVECTOR SG = vdupq_n_f32(g);
    R1 = vmlaq_f32( Position0, R1, SF);
    return vmlaq_f32( R1, R2, SG );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR R1 = _mm_sub_ps(Position1,Position0);
    XMVECTOR SF = _mm_set_ps1(f);
    XMVECTOR R2 = _mm_sub_ps(Position2,Position0);
    XMVECTOR SG = _mm_set_ps1(g);
    R1 = _mm_mul_ps(R1,SF);
    R2 = _mm_mul_ps(R2,SG);
    R1 = _mm_add_ps(R1,Position0);
    R1 = _mm_add_ps(R1,R2);
    return R1;
// NOTE(review): empty fallback branch, as in XMVectorLerp — confirm macro name.
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Barycentric combination with per-lane weight vectors F and G.

inline XMVECTOR XMVectorBaryCentricV
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR F,
    CXMVECTOR G
)
{
    // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0)

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR P10 = XMVectorSubtract(Position1, Position0);
    XMVECTOR P20 = XMVectorSubtract(Position2, Position0);

    XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0);
    Result = XMVectorMultiplyAdd(P20, G, Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR R1 = vsubq_f32(Position1,Position0);
    XMVECTOR R2 = vsubq_f32(Position2,Position0);
    R1 = vmlaq_f32( Position0, R1, F );
    return vmlaq_f32( R1, R2, G);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR R1 = _mm_sub_ps(Position1,Position0);
    XMVECTOR R2 = _mm_sub_ps(Position2,Position0);
    R1 = _mm_mul_ps(R1,F);
    R2 = _mm_mul_ps(R2,G);
    R1 = _mm_add_ps(R1,Position0);
    R1 = _mm_add_ps(R1,R2);
    return R1;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

/****************************************************************************
 *
 * 2D Vector
 *
 ****************************************************************************/

//------------------------------------------------------------------------------
// Comparison operations
//------------------------------------------------------------------------------

//------------------------------------------------------------------------------
// True when the x and y components of V1 and V2 are exactly equal
// (z and w are ignored by every 2D comparison below).

inline bool XMVector2Equal
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
// z and w are don't care
    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllTrue(XMVector2EqualR(V1, V2));
#endif
}


//------------------------------------------------------------------------------
// CR6-style record for x/y equality: XM_CRMASK_CR6TRUE when both lanes equal,
// XM_CRMASK_CR6FALSE when both differ, 0 when mixed.

inline uint32_t XMVector2EqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    uint32_t CR = 0;
    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] == V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] != V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
// z and w are don't care
    int iTest = _mm_movemask_ps(vTemp)&3;
    uint32_t CR = 0;
    if (iTest==3)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Bitwise (integer) equality of the x and y components.

inline bool XMVector2EqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)==3) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllTrue(XMVector2EqualIntR(V1, V2));
#endif
}

//------------------------------------------------------------------------------
// CR6-style record for bitwise x/y equality: CR6TRUE when both lanes match,
// CR6FALSE when both differ, 0 when mixed.

inline uint32_t XMVector2EqualIntR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    uint32_t CR = 0;
    if ((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
        (V1.vector4_u32[1] == V2.vector4_u32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) &&
        (V1.vector4_u32[1] != V2.vector4_u32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&3;
    uint32_t CR = 0;
    if (iTest==3)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// True when |V1 - V2| <= Epsilon component-wise for x and y.

inline bool XMVector2NearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
)
{
#if defined(_XM_NO_INTRINSICS_)
    float dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
    float dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
    return ((dx <= Epsilon.vector4_f32[0]) &&
        (dy <= Epsilon.vector4_f32[1]));
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vDelta = vsub_f32(vget_low_u32(V1), vget_low_u32(V2));
    // vacle: absolute compare |delta| <= epsilon
    __n64 vTemp = vacle_f32( vDelta, vget_low_u32(Epsilon) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    return ( r == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Get the difference
    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
    // Get the absolute value of the difference (max of delta and -delta)
    XMVECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_sub_ps(vTemp,vDelta);
    vTemp = _mm_max_ps(vTemp,vDelta);
    vTemp = _mm_cmple_ps(vTemp,Epsilon);
    // z and w are don't care
    return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0);
#else
// _XM_VMX128_INTRINSICS_ 5710#endif // _XM_VMX128_INTRINSICS_ 5711} 5712 5713//------------------------------------------------------------------------------ 5714 5715inline bool XMVector2NotEqual 5716( 5717 FXMVECTOR V1, 5718 FXMVECTOR V2 5719) 5720{ 5721#if defined(_XM_NO_INTRINSICS_) 5722 return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0); 5723#elif defined(_XM_ARM_NEON_INTRINSICS_) 5724 __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5725 return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); 5726#elif defined(_XM_SSE_INTRINSICS_) 5727 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 5728// z and w are don't care 5729 return (((_mm_movemask_ps(vTemp)&3)!=3) != 0); 5730#else // _XM_VMX128_INTRINSICS_ 5731 return XMComparisonAnyFalse(XMVector2EqualR(V1, V2)); 5732#endif 5733} 5734 5735//------------------------------------------------------------------------------ 5736 5737inline bool XMVector2NotEqualInt 5738( 5739 FXMVECTOR V1, 5740 FXMVECTOR V2 5741) 5742{ 5743#if defined(_XM_NO_INTRINSICS_) 5744 return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0); 5745#elif defined(_XM_ARM_NEON_INTRINSICS_) 5746 __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) ); 5747 return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU ); 5748#elif defined(_XM_SSE_INTRINSICS_) 5749 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 5750 return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)!=3) != 0); 5751#else // _XM_VMX128_INTRINSICS_ 5752 return XMComparisonAnyFalse(XMVector2EqualIntR(V1, V2)); 5753#endif 5754} 5755 5756//------------------------------------------------------------------------------ 5757 5758inline bool XMVector2Greater 5759( 5760 FXMVECTOR V1, 5761 FXMVECTOR V2 5762) 5763{ 5764#if defined(_XM_NO_INTRINSICS_) 5765 return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0); 5766#elif 
defined(_XM_ARM_NEON_INTRINSICS_) 5767 __n64 vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5768 return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); 5769#elif defined(_XM_SSE_INTRINSICS_) 5770 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 5771// z and w are don't care 5772 return (((_mm_movemask_ps(vTemp)&3)==3) != 0); 5773#else // _XM_VMX128_INTRINSICS_ 5774 return XMComparisonAllTrue(XMVector2GreaterR(V1, V2)); 5775#endif 5776} 5777 5778//------------------------------------------------------------------------------ 5779 5780inline uint32_t XMVector2GreaterR 5781( 5782 FXMVECTOR V1, 5783 FXMVECTOR V2 5784) 5785{ 5786#if defined(_XM_NO_INTRINSICS_) 5787 5788 uint32_t CR = 0; 5789 if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && 5790 (V1.vector4_f32[1] > V2.vector4_f32[1])) 5791 { 5792 CR = XM_CRMASK_CR6TRUE; 5793 } 5794 else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && 5795 (V1.vector4_f32[1] <= V2.vector4_f32[1])) 5796 { 5797 CR = XM_CRMASK_CR6FALSE; 5798 } 5799 return CR; 5800 5801#elif defined(_XM_ARM_NEON_INTRINSICS_) 5802 __n64 vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5803 uint64_t r = vget_lane_u64( vTemp, 0 ); 5804 uint32_t CR = 0; 5805 if ( r == 0xFFFFFFFFFFFFFFFFU ) 5806 { 5807 CR = XM_CRMASK_CR6TRUE; 5808 } 5809 else if ( !r ) 5810 { 5811 CR = XM_CRMASK_CR6FALSE; 5812 } 5813 return CR; 5814#elif defined(_XM_SSE_INTRINSICS_) 5815 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 5816 int iTest = _mm_movemask_ps(vTemp)&3; 5817 uint32_t CR = 0; 5818 if (iTest==3) 5819 { 5820 CR = XM_CRMASK_CR6TRUE; 5821 } 5822 else if (!iTest) 5823 { 5824 CR = XM_CRMASK_CR6FALSE; 5825 } 5826 return CR; 5827#else // _XM_VMX128_INTRINSICS_ 5828#endif // _XM_VMX128_INTRINSICS_ 5829} 5830 5831//------------------------------------------------------------------------------ 5832 5833inline bool XMVector2GreaterOrEqual 5834( 5835 FXMVECTOR V1, 5836 FXMVECTOR V2 5837) 5838{ 5839#if defined(_XM_NO_INTRINSICS_) 5840 return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && 
(V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0); 5841#elif defined(_XM_ARM_NEON_INTRINSICS_) 5842 __n64 vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5843 return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); 5844#elif defined(_XM_SSE_INTRINSICS_) 5845 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 5846 return (((_mm_movemask_ps(vTemp)&3)==3) != 0); 5847#else // _XM_VMX128_INTRINSICS_ 5848 return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V1, V2)); 5849#endif 5850} 5851 5852//------------------------------------------------------------------------------ 5853 5854inline uint32_t XMVector2GreaterOrEqualR 5855( 5856 FXMVECTOR V1, 5857 FXMVECTOR V2 5858) 5859{ 5860#if defined(_XM_NO_INTRINSICS_) 5861 5862 uint32_t CR = 0; 5863 if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && 5864 (V1.vector4_f32[1] >= V2.vector4_f32[1])) 5865 { 5866 CR = XM_CRMASK_CR6TRUE; 5867 } 5868 else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && 5869 (V1.vector4_f32[1] < V2.vector4_f32[1])) 5870 { 5871 CR = XM_CRMASK_CR6FALSE; 5872 } 5873 return CR; 5874 5875#elif defined(_XM_ARM_NEON_INTRINSICS_) 5876 __n64 vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5877 uint64_t r = vget_lane_u64( vTemp, 0 ); 5878 uint32_t CR = 0; 5879 if ( r == 0xFFFFFFFFFFFFFFFFU ) 5880 { 5881 CR = XM_CRMASK_CR6TRUE; 5882 } 5883 else if ( !r ) 5884 { 5885 CR = XM_CRMASK_CR6FALSE; 5886 } 5887 return CR; 5888#elif defined(_XM_SSE_INTRINSICS_) 5889 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 5890 int iTest = _mm_movemask_ps(vTemp)&3; 5891 uint32_t CR = 0; 5892 if (iTest == 3) 5893 { 5894 CR = XM_CRMASK_CR6TRUE; 5895 } 5896 else if (!iTest) 5897 { 5898 CR = XM_CRMASK_CR6FALSE; 5899 } 5900 return CR; 5901#else // _XM_VMX128_INTRINSICS_ 5902#endif // _XM_VMX128_INTRINSICS_ 5903} 5904 5905//------------------------------------------------------------------------------ 5906 5907inline bool XMVector2Less 5908( 5909 FXMVECTOR V1, 5910 FXMVECTOR V2 5911) 5912{ 5913#if defined(_XM_NO_INTRINSICS_) 5914 return 
(((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0); 5915#elif defined(_XM_ARM_NEON_INTRINSICS_) 5916 __n64 vTemp = vclt_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5917 return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); 5918#elif defined(_XM_SSE_INTRINSICS_) 5919 XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); 5920 return (((_mm_movemask_ps(vTemp)&3)==3) != 0); 5921#else // _XM_VMX128_INTRINSICS_ 5922 return XMComparisonAllTrue(XMVector2GreaterR(V2, V1)); 5923#endif 5924} 5925 5926//------------------------------------------------------------------------------ 5927 5928inline bool XMVector2LessOrEqual 5929( 5930 FXMVECTOR V1, 5931 FXMVECTOR V2 5932) 5933{ 5934#if defined(_XM_NO_INTRINSICS_) 5935 return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0); 5936#elif defined(_XM_ARM_NEON_INTRINSICS_) 5937 __n64 vTemp = vcle_f32( vget_low_f32(V1), vget_low_f32(V2) ); 5938 return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU ); 5939#elif defined(_XM_SSE_INTRINSICS_) 5940 XMVECTOR vTemp = _mm_cmple_ps(V1,V2); 5941 return (((_mm_movemask_ps(vTemp)&3)==3) != 0); 5942#else // _XM_VMX128_INTRINSICS_ 5943 return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V2, V1)); 5944#endif 5945} 5946 5947//------------------------------------------------------------------------------ 5948 5949inline bool XMVector2InBounds 5950( 5951 FXMVECTOR V, 5952 FXMVECTOR Bounds 5953) 5954{ 5955#if defined(_XM_NO_INTRINSICS_) 5956 return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && 5957 (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0); 5958#elif defined(_XM_ARM_NEON_INTRINSICS_) 5959 __n64 VL = vget_low_f32( V ); 5960 __n64 B = vget_low_f32( Bounds ); 5961 // Test if less than or equal 5962 __n64 vTemp1 = vcle_f32(VL,B); 5963 // Negate the bounds 5964 __n64 vTemp2 = vneg_f32(B); 5965 // Test if greater or equal 
(Reversed) 5966 vTemp2 = vcle_f32(vTemp2,VL); 5967 // Blend answers 5968 vTemp1 = vand_u32(vTemp1,vTemp2); 5969 // x and y in bounds? 5970 return ( vget_lane_u64( vTemp1, 0 ) == 0xFFFFFFFFFFFFFFFFU ); 5971#elif defined(_XM_SSE_INTRINSICS_) 5972 // Test if less than or equal 5973 XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); 5974 // Negate the bounds 5975 XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); 5976 // Test if greater or equal (Reversed) 5977 vTemp2 = _mm_cmple_ps(vTemp2,V); 5978 // Blend answers 5979 vTemp1 = _mm_and_ps(vTemp1,vTemp2); 5980 // x and y in bounds? (z and w are don't care) 5981 return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0); 5982#else // _XM_VMX128_INTRINSICS_ 5983 return XMComparisonAllInBounds(XMVector2InBoundsR(V, Bounds)); 5984#endif 5985} 5986 5987 5988//------------------------------------------------------------------------------ 5989 5990inline bool XMVector2IsNaN 5991( 5992 FXMVECTOR V 5993) 5994{ 5995#if defined(_XM_NO_INTRINSICS_) 5996 return (XMISNAN(V.vector4_f32[0]) || 5997 XMISNAN(V.vector4_f32[1])); 5998#elif defined(_XM_ARM_NEON_INTRINSICS_) 5999 __n64 VL = vget_low_f32( V ); 6000 // Test against itself. NaN is always not equal 6001 __n64 vTempNan = vceq_f32( VL, VL ); 6002 // If x or y are NaN, the mask is zero 6003 return ( vget_lane_u64( vTempNan, 0 ) != 0xFFFFFFFFFFFFFFFFU ); 6004#elif defined(_XM_SSE_INTRINSICS_) 6005 // Test against itself. 
NaN is always not equal 6006 XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); 6007 // If x or y are NaN, the mask is non-zero 6008 return ((_mm_movemask_ps(vTempNan)&3) != 0); 6009#else // _XM_VMX128_INTRINSICS_ 6010#endif // _XM_VMX128_INTRINSICS_ 6011} 6012 6013//------------------------------------------------------------------------------ 6014 6015inline bool XMVector2IsInfinite 6016( 6017 FXMVECTOR V 6018) 6019{ 6020#if defined(_XM_NO_INTRINSICS_) 6021 6022 return (XMISINF(V.vector4_f32[0]) || 6023 XMISINF(V.vector4_f32[1])); 6024#elif defined(_XM_ARM_NEON_INTRINSICS_) 6025 // Mask off the sign bit 6026 __n64 vTemp = vand_u32( vget_low_f32( V ) , vget_low_f32( g_XMAbsMask ) ); 6027 // Compare to infinity 6028 vTemp = vceq_f32(vTemp, vget_low_f32( g_XMInfinity) ); 6029 // If any are infinity, the signs are true. 6030 return vget_lane_u64( vTemp, 0 ) != 0; 6031#elif defined(_XM_SSE_INTRINSICS_) 6032 // Mask off the sign bit 6033 __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); 6034 // Compare to infinity 6035 vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); 6036 // If x or z are infinity, the signs are true. 
6037 return ((_mm_movemask_ps(vTemp)&3) != 0); 6038#else // _XM_VMX128_INTRINSICS_ 6039#endif // _XM_VMX128_INTRINSICS_ 6040} 6041 6042//------------------------------------------------------------------------------ 6043// Computation operations 6044//------------------------------------------------------------------------------ 6045 6046//------------------------------------------------------------------------------ 6047 6048inline XMVECTOR XMVector2Dot 6049( 6050 FXMVECTOR V1, 6051 FXMVECTOR V2 6052) 6053{ 6054#if defined(_XM_NO_INTRINSICS_) 6055 6056 XMVECTOR Result; 6057 Result.vector4_f32[0] = 6058 Result.vector4_f32[1] = 6059 Result.vector4_f32[2] = 6060 Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1]; 6061 return Result; 6062 6063#elif defined(_XM_ARM_NEON_INTRINSICS_) 6064 // Perform the dot product on x and y 6065 __n64 vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) ); 6066 vTemp = vpadd_f32( vTemp, vTemp ); 6067 return vcombine_f32( vTemp, vTemp ); 6068#elif defined(_XM_SSE_INTRINSICS_) 6069 // Perform the dot product on x and y 6070 XMVECTOR vLengthSq = _mm_mul_ps(V1,V2); 6071 // vTemp has y splatted 6072 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6073 // x+y 6074 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6075 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6076 return vLengthSq; 6077#else // _XM_VMX128_INTRINSICS_ 6078#endif // _XM_VMX128_INTRINSICS_ 6079} 6080 6081//------------------------------------------------------------------------------ 6082 6083inline XMVECTOR XMVector2Cross 6084( 6085 FXMVECTOR V1, 6086 FXMVECTOR V2 6087) 6088{ 6089 // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ] 6090 6091#if defined(_XM_NO_INTRINSICS_) 6092 float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]); 6093 XMVECTOR vResult = { 6094 fCross, 6095 fCross, 6096 fCross, 6097 fCross 6098 }; 6099 return vResult; 6100#elif 
defined(_XM_ARM_NEON_INTRINSICS_) 6101 static const XMVECTORF32 Negate = { 1.f, -1.f, 0, 0 }; 6102 6103 __n64 vTemp = vmul_f32( vget_low_f32( V1 ), vrev64_f32( vget_low_f32( V2 ) ) ); 6104 vTemp = vmul_f32( vTemp, vget_low_f32( Negate ) ); 6105 vTemp = vpadd_f32( vTemp, vTemp ); 6106 return vcombine_f32( vTemp, vTemp ); 6107#elif defined(_XM_SSE_INTRINSICS_) 6108 // Swap x and y 6109 XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(0,1,0,1)); 6110 // Perform the muls 6111 vResult = _mm_mul_ps(vResult,V1); 6112 // Splat y 6113 XMVECTOR vTemp = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1)); 6114 // Sub the values 6115 vResult = _mm_sub_ss(vResult,vTemp); 6116 // Splat the cross product 6117 vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,0,0,0)); 6118 return vResult; 6119#else // _XM_VMX128_INTRINSICS_ 6120#endif // _XM_VMX128_INTRINSICS_ 6121} 6122 6123//------------------------------------------------------------------------------ 6124 6125inline XMVECTOR XMVector2LengthSq 6126( 6127 FXMVECTOR V 6128) 6129{ 6130 return XMVector2Dot(V, V); 6131} 6132 6133//------------------------------------------------------------------------------ 6134 6135inline XMVECTOR XMVector2ReciprocalLengthEst 6136( 6137 FXMVECTOR V 6138) 6139{ 6140#if defined(_XM_NO_INTRINSICS_) 6141 6142 XMVECTOR Result; 6143 Result = XMVector2LengthSq(V); 6144 Result = XMVectorReciprocalSqrtEst(Result); 6145 return Result; 6146 6147#elif defined(_XM_ARM_NEON_INTRINSICS_) 6148 __n64 VL = vget_low_f32(V); 6149 // Dot2 6150 __n64 vTemp = vmul_f32( VL, VL ); 6151 vTemp = vpadd_f32( vTemp, vTemp ); 6152 // Reciprocal sqrt (estimate) 6153 vTemp = vrsqrte_f32( vTemp ); 6154 return vcombine_f32( vTemp, vTemp ); 6155#elif defined(_XM_SSE_INTRINSICS_) 6156 // Perform the dot product on x and y 6157 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6158 // vTemp has y splatted 6159 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6160 // x+y 6161 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6162 vLengthSq = 
_mm_rsqrt_ss(vLengthSq); 6163 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6164 return vLengthSq; 6165#else // _XM_VMX128_INTRINSICS_ 6166#endif // _XM_VMX128_INTRINSICS_ 6167} 6168 6169//------------------------------------------------------------------------------ 6170 6171inline XMVECTOR XMVector2ReciprocalLength 6172( 6173 FXMVECTOR V 6174) 6175{ 6176#if defined(_XM_NO_INTRINSICS_) 6177 6178 XMVECTOR Result; 6179 Result = XMVector2LengthSq(V); 6180 Result = XMVectorReciprocalSqrt(Result); 6181 return Result; 6182 6183#elif defined(_XM_ARM_NEON_INTRINSICS_) 6184 __n64 VL = vget_low_f32(V); 6185 // Dot2 6186 __n64 vTemp = vmul_f32( VL, VL ); 6187 vTemp = vpadd_f32( vTemp, vTemp ); 6188 // Reciprocal sqrt 6189 __n64 S0 = vrsqrte_f32(vTemp); 6190 __n64 P0 = vmul_f32( vTemp, S0 ); 6191 __n64 R0 = vrsqrts_f32( P0, S0 ); 6192 __n64 S1 = vmul_f32( S0, R0 ); 6193 __n64 P1 = vmul_f32( vTemp, S1 ); 6194 __n64 R1 = vrsqrts_f32( P1, S1 ); 6195 __n64 Result = vmul_f32( S1, R1 ); 6196 return vcombine_f32( Result, Result ); 6197#elif defined(_XM_SSE_INTRINSICS_) 6198 // Perform the dot product on x and y 6199 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6200 // vTemp has y splatted 6201 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6202 // x+y 6203 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6204 vLengthSq = _mm_sqrt_ss(vLengthSq); 6205 vLengthSq = _mm_div_ss(g_XMOne,vLengthSq); 6206 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6207 return vLengthSq; 6208#else // _XM_VMX128_INTRINSICS_ 6209#endif // _XM_VMX128_INTRINSICS_ 6210} 6211 6212//------------------------------------------------------------------------------ 6213 6214inline XMVECTOR XMVector2LengthEst 6215( 6216 FXMVECTOR V 6217) 6218{ 6219#if defined(_XM_NO_INTRINSICS_) 6220 6221 XMVECTOR Result; 6222 Result = XMVector2LengthSq(V); 6223 Result = XMVectorSqrtEst(Result); 6224 return Result; 6225 6226#elif defined(_XM_ARM_NEON_INTRINSICS_) 6227 __n64 VL = vget_low_f32(V); 6228 // Dot2 
6229 __n64 vTemp = vmul_f32( VL, VL ); 6230 vTemp = vpadd_f32( vTemp, vTemp ); 6231 const __n64 zero = vdup_n_u32(0); 6232 __n64 VEqualsZero = vceq_f32( vTemp, zero ); 6233 // Sqrt (estimate) 6234 __n64 Result = vrsqrte_f32( vTemp ); 6235 Result = vmul_f32( vTemp, Result ); 6236 Result = vbsl_f32( VEqualsZero, zero, Result ); 6237 return vcombine_f32( Result, Result ); 6238#elif defined(_XM_SSE_INTRINSICS_) 6239 // Perform the dot product on x and y 6240 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6241 // vTemp has y splatted 6242 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6243 // x+y 6244 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6245 vLengthSq = _mm_sqrt_ss(vLengthSq); 6246 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6247 return vLengthSq; 6248#else // _XM_VMX128_INTRINSICS_ 6249#endif // _XM_VMX128_INTRINSICS_ 6250} 6251 6252//------------------------------------------------------------------------------ 6253 6254inline XMVECTOR XMVector2Length 6255( 6256 FXMVECTOR V 6257) 6258{ 6259#if defined(_XM_NO_INTRINSICS_) 6260 6261 XMVECTOR Result; 6262 Result = XMVector2LengthSq(V); 6263 Result = XMVectorSqrt(Result); 6264 return Result; 6265 6266#elif defined(_XM_ARM_NEON_INTRINSICS_) 6267 __n64 VL = vget_low_f32(V); 6268 // Dot2 6269 __n64 vTemp = vmul_f32( VL, VL ); 6270 vTemp = vpadd_f32( vTemp, vTemp ); 6271 const __n64 zero = vdup_n_u32(0); 6272 __n64 VEqualsZero = vceq_f32( vTemp, zero ); 6273 // Sqrt 6274 __n64 S0 = vrsqrte_f32( vTemp ); 6275 __n64 P0 = vmul_f32( vTemp, S0 ); 6276 __n64 R0 = vrsqrts_f32( P0, S0 ); 6277 __n64 S1 = vmul_f32( S0, R0 ); 6278 __n64 P1 = vmul_f32( vTemp, S1 ); 6279 __n64 R1 = vrsqrts_f32( P1, S1 ); 6280 __n64 Result = vmul_f32( S1, R1 ); 6281 Result = vmul_f32( vTemp, Result ); 6282 Result = vbsl_f32( VEqualsZero, zero, Result ); 6283 return vcombine_f32( Result, Result ); 6284#elif defined(_XM_SSE_INTRINSICS_) 6285 // Perform the dot product on x and y 6286 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6287 // 
vTemp has y splatted 6288 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6289 // x+y 6290 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6291 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6292 vLengthSq = _mm_sqrt_ps(vLengthSq); 6293 return vLengthSq; 6294#else // _XM_VMX128_INTRINSICS_ 6295#endif // _XM_VMX128_INTRINSICS_ 6296} 6297 6298//------------------------------------------------------------------------------ 6299// XMVector2NormalizeEst uses a reciprocal estimate and 6300// returns QNaN on zero and infinite vectors. 6301 6302inline XMVECTOR XMVector2NormalizeEst 6303( 6304 FXMVECTOR V 6305) 6306{ 6307#if defined(_XM_NO_INTRINSICS_) 6308 6309 XMVECTOR Result; 6310 Result = XMVector2ReciprocalLength(V); 6311 Result = XMVectorMultiply(V, Result); 6312 return Result; 6313 6314#elif defined(_XM_ARM_NEON_INTRINSICS_) 6315 __n64 VL = vget_low_f32(V); 6316 // Dot2 6317 __n64 vTemp = vmul_f32( VL, VL ); 6318 vTemp = vpadd_f32( vTemp, vTemp ); 6319 // Reciprocal sqrt (estimate) 6320 vTemp = vrsqrte_f32( vTemp ); 6321 // Normalize 6322 __n64 Result = vmul_f32( VL, vTemp ); 6323 return vcombine_f32( Result, Result ); 6324#elif defined(_XM_SSE_INTRINSICS_) 6325 // Perform the dot product on x and y 6326 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6327 // vTemp has y splatted 6328 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6329 // x+y 6330 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6331 vLengthSq = _mm_rsqrt_ss(vLengthSq); 6332 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6333 vLengthSq = _mm_mul_ps(vLengthSq,V); 6334 return vLengthSq; 6335#else // _XM_VMX128_INTRINSICS_ 6336#endif // _XM_VMX128_INTRINSICS_ 6337} 6338 6339//------------------------------------------------------------------------------ 6340 6341inline XMVECTOR XMVector2Normalize 6342( 6343 FXMVECTOR V 6344) 6345{ 6346#if defined(_XM_NO_INTRINSICS_) 6347 6348 XMVECTOR vResult = XMVector2Length( V ); 6349 float fLength = vResult.vector4_f32[0]; 6350 6351 
// Prevent divide by zero 6352 if (fLength > 0) { 6353 fLength = 1.0f/fLength; 6354 } 6355 6356 vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; 6357 vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; 6358 vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; 6359 vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; 6360 return vResult; 6361 6362#elif defined(_XM_ARM_NEON_INTRINSICS_) 6363 __n64 VL = vget_low_f32(V); 6364 // Dot2 6365 __n64 vTemp = vmul_f32( VL, VL ); 6366 vTemp = vpadd_f32( vTemp, vTemp ); 6367 __n64 VEqualsZero = vceq_f32( vTemp, vdup_n_u32(0) ); 6368 __n64 VEqualsInf = vceq_f32( vTemp, vget_low_f32(g_XMInfinity) ); 6369 // Reciprocal sqrt (2 iterations of Newton-Raphson) 6370 __n64 S0 = vrsqrte_f32( vTemp ); 6371 __n64 P0 = vmul_f32( vTemp, S0 ); 6372 __n64 R0 = vrsqrts_f32( P0, S0 ); 6373 __n64 S1 = vmul_f32( S0, R0 ); 6374 __n64 P1 = vmul_f32( vTemp, S1 ); 6375 __n64 R1 = vrsqrts_f32( P1, S1 ); 6376 vTemp = vmul_f32( S1, R1 ); 6377 // Normalize 6378 __n64 Result = vmul_f32( VL, vTemp ); 6379 Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result ); 6380 Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result ); 6381 return vcombine_f32( Result, Result ); 6382#elif defined(_XM_SSE_INTRINSICS_) 6383 // Perform the dot product on x and y only 6384 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 6385 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1)); 6386 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 6387 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 6388 // Prepare for the division 6389 XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); 6390 // Create zero with a single instruction 6391 XMVECTOR vZeroMask = _mm_setzero_ps(); 6392 // Test for a divide by zero (Must be FP to detect -0.0) 6393 vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); 6394 // Failsafe on zero (Or epsilon) length planes 6395 // If the length is infinity, set the elements to zero 6396 vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); 6397 // Reciprocal mul to perform the 
normalization 6398 vResult = _mm_div_ps(V,vResult); 6399 // Any that are infinity, set to zero 6400 vResult = _mm_and_ps(vResult,vZeroMask); 6401 // Select qnan or result based on infinite length 6402 XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); 6403 XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); 6404 vResult = _mm_or_ps(vTemp1,vTemp2); 6405 return vResult; 6406#else // _XM_VMX128_INTRINSICS_ 6407#endif // _XM_VMX128_INTRINSICS_ 6408} 6409 6410//------------------------------------------------------------------------------ 6411 6412inline XMVECTOR XMVector2ClampLength 6413( 6414 FXMVECTOR V, 6415 float LengthMin, 6416 float LengthMax 6417) 6418{ 6419 XMVECTOR ClampMax = XMVectorReplicate(LengthMax); 6420 XMVECTOR ClampMin = XMVectorReplicate(LengthMin); 6421 return XMVector2ClampLengthV(V, ClampMin, ClampMax); 6422} 6423 6424//------------------------------------------------------------------------------ 6425 6426inline XMVECTOR XMVector2ClampLengthV 6427( 6428 FXMVECTOR V, 6429 FXMVECTOR LengthMin, 6430 FXMVECTOR LengthMax 6431) 6432{ 6433 assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin))); 6434 assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax))); 6435 assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero)); 6436 assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero)); 6437 assert(XMVector2GreaterOrEqual(LengthMax, LengthMin)); 6438 6439 XMVECTOR LengthSq = XMVector2LengthSq(V); 6440 6441 const XMVECTOR Zero = XMVectorZero(); 6442 6443 XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); 6444 6445 XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); 6446 XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); 6447 6448 XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); 6449 6450 XMVECTOR Normal = XMVectorMultiply(V, RcpLength); 6451 6452 XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); 6453 Length = XMVectorSelect(LengthSq, Length, Select); 6454 Normal = XMVectorSelect(LengthSq, Normal, Select); 6455 6456 
XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); 6457 XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); 6458 6459 XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); 6460 ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); 6461 6462 XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); 6463 6464 // Preserve the original vector (with no precision loss) if the length falls within the given range 6465 XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); 6466 Result = XMVectorSelect(Result, V, Control); 6467 6468 return Result; 6469} 6470 6471//------------------------------------------------------------------------------ 6472 6473inline XMVECTOR XMVector2Reflect 6474( 6475 FXMVECTOR Incident, 6476 FXMVECTOR Normal 6477) 6478{ 6479 // Result = Incident - (2 * dot(Incident, Normal)) * Normal 6480 6481 XMVECTOR Result; 6482 Result = XMVector2Dot(Incident, Normal); 6483 Result = XMVectorAdd(Result, Result); 6484 Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); 6485 return Result; 6486} 6487 6488//------------------------------------------------------------------------------ 6489 6490inline XMVECTOR XMVector2Refract 6491( 6492 FXMVECTOR Incident, 6493 FXMVECTOR Normal, 6494 float RefractionIndex 6495) 6496{ 6497 XMVECTOR Index = XMVectorReplicate(RefractionIndex); 6498 return XMVector2RefractV(Incident, Normal, Index); 6499} 6500 6501//------------------------------------------------------------------------------ 6502 6503// Return the refraction of a 2D vector 6504inline XMVECTOR XMVector2RefractV 6505( 6506 FXMVECTOR Incident, 6507 FXMVECTOR Normal, 6508 FXMVECTOR RefractionIndex 6509) 6510{ 6511 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + 6512 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) 6513 6514#if defined(_XM_NO_INTRINSICS_) 6515 6516 float IDotN = 
(Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]); 6517 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 6518 float RY = 1.0f-(IDotN*IDotN); 6519 float RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]); 6520 RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]); 6521 if (RX>=0.0f) { 6522 RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX))); 6523 } else { 6524 RX = 0.0f; 6525 } 6526 if (RY>=0.0f) { 6527 RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY))); 6528 } else { 6529 RY = 0.0f; 6530 } 6531 6532 XMVECTOR vResult; 6533 vResult.vector4_f32[0] = RX; 6534 vResult.vector4_f32[1] = RY; 6535 vResult.vector4_f32[2] = 0.0f; 6536 vResult.vector4_f32[3] = 0.0f; 6537 return vResult; 6538 6539#elif defined(_XM_ARM_NEON_INTRINSICS_) 6540 __n64 IL = vget_low_f32( Incident ); 6541 __n64 NL = vget_low_f32( Normal ); 6542 __n64 RIL = vget_low_f32( RefractionIndex ); 6543 // Get the 2D Dot product of Incident-Normal 6544 __n64 vTemp = vmul_f32(IL, NL); 6545 __n64 IDotN = vpadd_f32( vTemp, vTemp ); 6546 // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 6547 vTemp = vmls_f32( vget_low_f32( g_XMOne ), IDotN, IDotN); 6548 vTemp = vmul_f32(vTemp,RIL); 6549 vTemp = vmls_f32(vget_low_f32( g_XMOne ), vTemp, RIL ); 6550 // If any terms are <=0, sqrt() will fail, punt to zero 6551 __n64 vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero) ); 6552 // Sqrt(vTemp) 6553 __n64 S0 = vrsqrte_f32(vTemp); 6554 __n64 P0 = vmul_f32( vTemp, S0 ); 6555 __n64 R0 = vrsqrts_f32( P0, S0 ); 6556 __n64 S1 = vmul_f32( S0, R0 ); 6557 __n64 P1 = vmul_f32( vTemp, S1 ); 6558 __n64 R1 = vrsqrts_f32( P1, S1 ); 6559 __n64 S2 = vmul_f32( S1, R1 ); 6560 vTemp = vmul_f32( vTemp, S2 ); 6561 // R = RefractionIndex * 
IDotN + sqrt(R) 6562 vTemp = vmla_f32( vTemp, RIL, IDotN ); 6563 // Result = RefractionIndex * Incident - Normal * R 6564 __n64 vResult = vmul_f32(RIL,IL); 6565 vResult = vmls_f32( vResult, vTemp, NL ); 6566 vResult = vand_u32(vResult,vMask); 6567 return vcombine_f32(vResult, vResult); 6568#elif defined(_XM_SSE_INTRINSICS_) 6569 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + 6570 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) 6571 // Get the 2D Dot product of Incident-Normal 6572 XMVECTOR IDotN = XMVector2Dot(Incident, Normal); 6573 // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 6574 XMVECTOR vTemp = _mm_mul_ps(IDotN,IDotN); 6575 vTemp = _mm_sub_ps(g_XMOne,vTemp); 6576 vTemp = _mm_mul_ps(vTemp,RefractionIndex); 6577 vTemp = _mm_mul_ps(vTemp,RefractionIndex); 6578 vTemp = _mm_sub_ps(g_XMOne,vTemp); 6579 // If any terms are <=0, sqrt() will fail, punt to zero 6580 XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero); 6581 // R = RefractionIndex * IDotN + sqrt(R) 6582 vTemp = _mm_sqrt_ps(vTemp); 6583 XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN); 6584 vTemp = _mm_add_ps(vTemp,vResult); 6585 // Result = RefractionIndex * Incident - Normal * R 6586 vResult = _mm_mul_ps(RefractionIndex,Incident); 6587 vTemp = _mm_mul_ps(vTemp,Normal); 6588 vResult = _mm_sub_ps(vResult,vTemp); 6589 vResult = _mm_and_ps(vResult,vMask); 6590 return vResult; 6591#else // _XM_VMX128_INTRINSICS_ 6592#endif // _XM_VMX128_INTRINSICS_ 6593} 6594 6595//------------------------------------------------------------------------------ 6596 6597inline XMVECTOR XMVector2Orthogonal 6598( 6599 FXMVECTOR V 6600) 6601{ 6602#if defined(_XM_NO_INTRINSICS_) 6603 6604 XMVECTOR Result; 6605 Result.vector4_f32[0] = -V.vector4_f32[1]; 6606 Result.vector4_f32[1] = V.vector4_f32[0]; 6607 Result.vector4_f32[2] = 0.f; 6608 Result.vector4_f32[3] = 0.f; 6609 return Result; 6610 6611#elif 
defined(_XM_ARM_NEON_INTRINSICS_) 6612 static const XMVECTORF32 Negate = { -1.f, 1.f, 0, 0 }; 6613 const __n64 zero = vdup_n_f32(0); 6614 6615 __n64 VL = vget_low_f32( V ); 6616 __n64 Result = vmul_f32( vrev64_f32( VL ), vget_low_f32( Negate ) ); 6617 return vcombine_f32( Result, zero ); 6618#elif defined(_XM_SSE_INTRINSICS_) 6619 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1)); 6620 vResult = _mm_mul_ps(vResult,g_XMNegateX); 6621 return vResult; 6622#else // _XM_VMX128_INTRINSICS_ 6623#endif // _XM_VMX128_INTRINSICS_ 6624} 6625 6626//------------------------------------------------------------------------------ 6627 6628inline XMVECTOR XMVector2AngleBetweenNormalsEst 6629( 6630 FXMVECTOR N1, 6631 FXMVECTOR N2 6632) 6633{ 6634#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6635 6636 XMVECTOR Result = XMVector2Dot(N1, N2); 6637 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); 6638 Result = XMVectorACosEst(Result); 6639 return Result; 6640 6641#else // _XM_VMX128_INTRINSICS_ 6642#endif // _XM_VMX128_INTRINSICS_ 6643} 6644 6645//------------------------------------------------------------------------------ 6646 6647inline XMVECTOR XMVector2AngleBetweenNormals 6648( 6649 FXMVECTOR N1, 6650 FXMVECTOR N2 6651) 6652{ 6653#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6654 6655 XMVECTOR Result = XMVector2Dot(N1, N2); 6656 Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne); 6657 Result = XMVectorACos(Result); 6658 return Result; 6659 6660#else // _XM_VMX128_INTRINSICS_ 6661#endif // _XM_VMX128_INTRINSICS_ 6662} 6663 6664//------------------------------------------------------------------------------ 6665 6666inline XMVECTOR XMVector2AngleBetweenVectors 6667( 6668 FXMVECTOR V1, 6669 FXMVECTOR V2 6670) 6671{ 6672#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6673 6674 XMVECTOR L1 = 
XMVector2ReciprocalLength(V1); 6675 XMVECTOR L2 = XMVector2ReciprocalLength(V2); 6676 6677 XMVECTOR Dot = XMVector2Dot(V1, V2); 6678 6679 L1 = XMVectorMultiply(L1, L2); 6680 6681 XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); 6682 CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); 6683 6684 return XMVectorACos(CosAngle); 6685 6686#else // _XM_VMX128_INTRINSICS_ 6687#endif // _XM_VMX128_INTRINSICS_ 6688} 6689 6690//------------------------------------------------------------------------------ 6691 6692inline XMVECTOR XMVector2LinePointDistance 6693( 6694 FXMVECTOR LinePoint1, 6695 FXMVECTOR LinePoint2, 6696 FXMVECTOR Point 6697) 6698{ 6699 // Given a vector PointVector from LinePoint1 to Point and a vector 6700 // LineVector from LinePoint1 to LinePoint2, the scaled distance 6701 // PointProjectionScale from LinePoint1 to the perpendicular projection 6702 // of PointVector onto the line is defined as: 6703 // 6704 // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) 6705 6706#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6707 6708 XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); 6709 XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); 6710 6711 XMVECTOR LengthSq = XMVector2LengthSq(LineVector); 6712 6713 XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector); 6714 PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); 6715 6716 XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); 6717 DistanceVector = XMVectorSubtract(PointVector, DistanceVector); 6718 6719 return XMVector2Length(DistanceVector); 6720 6721#else // _XM_VMX128_INTRINSICS_ 6722#endif // _XM_VMX128_INTRINSICS_ 6723} 6724 6725//------------------------------------------------------------------------------ 6726 6727inline XMVECTOR XMVector2IntersectLine 6728( 6729 FXMVECTOR Line1Point1, 6730 FXMVECTOR Line1Point2, 6731 FXMVECTOR 
Line2Point1, 6732 GXMVECTOR Line2Point2 6733) 6734{ 6735#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6736 6737 XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1); 6738 XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1); 6739 XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1); 6740 6741 XMVECTOR C1 = XMVector2Cross(V1, V2); 6742 XMVECTOR C2 = XMVector2Cross(V2, V3); 6743 6744 XMVECTOR Result; 6745 const XMVECTOR Zero = XMVectorZero(); 6746 if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v)) 6747 { 6748 if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v)) 6749 { 6750 // Coincident 6751 Result = g_XMInfinity.v; 6752 } 6753 else 6754 { 6755 // Parallel 6756 Result = g_XMQNaN.v; 6757 } 6758 } 6759 else 6760 { 6761 // Intersection point = Line1Point1 + V1 * (C2 / C1) 6762 XMVECTOR Scale = XMVectorReciprocal(C1); 6763 Scale = XMVectorMultiply(C2, Scale); 6764 Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1); 6765 } 6766 6767 return Result; 6768 6769#elif defined(_XM_SSE_INTRINSICS_) 6770 XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1); 6771 XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1); 6772 XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1); 6773 // Generate the cross products 6774 XMVECTOR C1 = XMVector2Cross(V1, V2); 6775 XMVECTOR C2 = XMVector2Cross(V2, V3); 6776 // If C1 is not close to epsilon, use the calculated value 6777 XMVECTOR vResultMask = _mm_setzero_ps(); 6778 vResultMask = _mm_sub_ps(vResultMask,C1); 6779 vResultMask = _mm_max_ps(vResultMask,C1); 6780 // 0xFFFFFFFF if the calculated value is to be used 6781 vResultMask = _mm_cmpgt_ps(vResultMask,g_XMEpsilon); 6782 // If C1 is close to epsilon, which fail type is it? INFINITY or NAN? 
6783 XMVECTOR vFailMask = _mm_setzero_ps(); 6784 vFailMask = _mm_sub_ps(vFailMask,C2); 6785 vFailMask = _mm_max_ps(vFailMask,C2); 6786 vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon); 6787 XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity); 6788 vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN); 6789 // vFail is NAN or INF 6790 vFail = _mm_or_ps(vFail,vFailMask); 6791 // Intersection point = Line1Point1 + V1 * (C2 / C1) 6792 XMVECTOR vResult = _mm_div_ps(C2,C1); 6793 vResult = _mm_mul_ps(vResult,V1); 6794 vResult = _mm_add_ps(vResult,Line1Point1); 6795 // Use result, or failure value 6796 vResult = _mm_and_ps(vResult,vResultMask); 6797 vResultMask = _mm_andnot_ps(vResultMask,vFail); 6798 vResult = _mm_or_ps(vResult,vResultMask); 6799 return vResult; 6800#else // _XM_VMX128_INTRINSICS_ 6801#endif // _XM_VMX128_INTRINSICS_ 6802} 6803 6804//------------------------------------------------------------------------------ 6805 6806inline XMVECTOR XMVector2Transform 6807( 6808 FXMVECTOR V, 6809 CXMMATRIX M 6810) 6811{ 6812#if defined(_XM_NO_INTRINSICS_) 6813 6814 XMVECTOR Y = XMVectorSplatY(V); 6815 XMVECTOR X = XMVectorSplatX(V); 6816 6817 XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); 6818 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 6819 6820 return Result; 6821 6822#elif defined(_XM_ARM_NEON_INTRINSICS_) 6823 __n64 VL = vget_low_f32( V ); 6824 __n128 Y = vdupq_lane_f32( VL, 1 ); 6825 __n128 Result = vmlaq_f32( M.r[3], Y, M.r[1] ); 6826 __n128 X = vdupq_lane_f32( VL, 0 ); 6827 return vmlaq_f32( Result, X, M.r[0] ); 6828#elif defined(_XM_SSE_INTRINSICS_) 6829 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); 6830 vResult = _mm_mul_ps(vResult,M.r[0]); 6831 XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 6832 vTemp = _mm_mul_ps(vTemp,M.r[1]); 6833 vResult = _mm_add_ps(vResult,vTemp); 6834 vResult = _mm_add_ps(vResult,M.r[3]); 6835 return vResult; 6836#else // _XM_VMX128_INTRINSICS_ 6837#endif // _XM_VMX128_INTRINSICS_ 6838} 6839 
6840//------------------------------------------------------------------------------ 6841 6842_Use_decl_annotations_ 6843inline XMFLOAT4* XMVector2TransformStream 6844( 6845 XMFLOAT4* pOutputStream, 6846 size_t OutputStride, 6847 const XMFLOAT2* pInputStream, 6848 size_t InputStride, 6849 size_t VectorCount, 6850 CXMMATRIX M 6851) 6852{ 6853 assert(pOutputStream != NULL); 6854 assert(pInputStream != NULL); 6855 6856#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6857 6858 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 6859 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 6860 6861 const XMVECTOR row0 = M.r[0]; 6862 const XMVECTOR row1 = M.r[1]; 6863 const XMVECTOR row3 = M.r[3]; 6864 6865 for (size_t i = 0; i < VectorCount; i++) 6866 { 6867 XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); 6868 XMVECTOR Y = XMVectorSplatY(V); 6869 XMVECTOR X = XMVectorSplatX(V); 6870 6871 XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); 6872 Result = XMVectorMultiplyAdd(X, row0, Result); 6873 6874 XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); 6875 6876 pInputVector += InputStride; 6877 pOutputVector += OutputStride; 6878 } 6879 6880 return pOutputStream; 6881 6882#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 6883#endif // _XM_VMX128_INTRINSICS_ 6884} 6885 6886 6887//------------------------------------------------------------------------------ 6888 6889inline XMVECTOR XMVector2TransformCoord 6890( 6891 FXMVECTOR V, 6892 CXMMATRIX M 6893) 6894{ 6895#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6896 6897 XMVECTOR Y = XMVectorSplatY(V); 6898 XMVECTOR X = XMVectorSplatX(V); 6899 6900 XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]); 6901 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 6902 6903 XMVECTOR W = XMVectorSplatW(Result); 6904 return XMVectorDivide( Result, W ); 6905 6906#else // _XM_VMX128_INTRINSICS_ 6907#endif // 
_XM_VMX128_INTRINSICS_ 6908} 6909 6910//------------------------------------------------------------------------------ 6911 6912_Use_decl_annotations_ 6913inline XMFLOAT2* XMVector2TransformCoordStream 6914( 6915 XMFLOAT2* pOutputStream, 6916 size_t OutputStride, 6917 const XMFLOAT2* pInputStream, 6918 size_t InputStride, 6919 size_t VectorCount, 6920 CXMMATRIX M 6921) 6922{ 6923 assert(pOutputStream != NULL); 6924 assert(pInputStream != NULL); 6925 6926#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 6927 6928 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 6929 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 6930 6931 const XMVECTOR row0 = M.r[0]; 6932 const XMVECTOR row1 = M.r[1]; 6933 const XMVECTOR row3 = M.r[3]; 6934 6935 for (size_t i = 0; i < VectorCount; i++) 6936 { 6937 XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); 6938 XMVECTOR Y = XMVectorSplatY(V); 6939 XMVECTOR X = XMVectorSplatX(V); 6940 6941 XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3); 6942 Result = XMVectorMultiplyAdd(X, row0, Result); 6943 6944 XMVECTOR W = XMVectorSplatW(Result); 6945 6946 Result = XMVectorDivide(Result, W); 6947 6948 XMStoreFloat2((XMFLOAT2*)pOutputVector, Result); 6949 6950 pInputVector += InputStride; 6951 pOutputVector += OutputStride; 6952 } 6953 6954 return pOutputStream; 6955 6956#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 6957#endif // _XM_VMX128_INTRINSICS_ 6958} 6959 6960//------------------------------------------------------------------------------ 6961 6962inline XMVECTOR XMVector2TransformNormal 6963( 6964 FXMVECTOR V, 6965 CXMMATRIX M 6966) 6967{ 6968#if defined(_XM_NO_INTRINSICS_) 6969 6970 XMVECTOR Y = XMVectorSplatY(V); 6971 XMVECTOR X = XMVectorSplatX(V); 6972 6973 XMVECTOR Result = XMVectorMultiply(Y, M.r[1]); 6974 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 6975 6976 return Result; 6977 6978#elif defined(_XM_ARM_NEON_INTRINSICS_) 6979 __n64 VL = vget_low_f32( V ); 
6980 __n128 Y = vdupq_lane_f32( VL, 1 ); 6981 __n128 Result = vmulq_f32( Y, M.r[1] ); 6982 __n128 X = vdupq_lane_f32( VL, 0 ); 6983 return vmlaq_f32( Result, X, M.r[0] ); 6984#elif defined(_XM_SSE_INTRINSICS_) 6985 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); 6986 vResult = _mm_mul_ps(vResult,M.r[0]); 6987 XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 6988 vTemp = _mm_mul_ps(vTemp,M.r[1]); 6989 vResult = _mm_add_ps(vResult,vTemp); 6990 return vResult; 6991#else // _XM_VMX128_INTRINSICS_ 6992#endif // _XM_VMX128_INTRINSICS_ 6993} 6994 6995//------------------------------------------------------------------------------ 6996 6997_Use_decl_annotations_ 6998inline XMFLOAT2* XMVector2TransformNormalStream 6999( 7000 XMFLOAT2* pOutputStream, 7001 size_t OutputStride, 7002 const XMFLOAT2* pInputStream, 7003 size_t InputStride, 7004 size_t VectorCount, 7005 CXMMATRIX M 7006) 7007{ 7008 assert(pOutputStream != NULL); 7009 assert(pInputStream != NULL); 7010 7011#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 7012 7013 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 7014 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 7015 7016 const XMVECTOR row0 = M.r[0]; 7017 const XMVECTOR row1 = M.r[1]; 7018 7019 for (size_t i = 0; i < VectorCount; i++) 7020 { 7021 XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector); 7022 XMVECTOR Y = XMVectorSplatY(V); 7023 XMVECTOR X = XMVectorSplatX(V); 7024 7025 XMVECTOR Result = XMVectorMultiply(Y, row1); 7026 Result = XMVectorMultiplyAdd(X, row0, Result); 7027 7028 XMStoreFloat2((XMFLOAT2*)pOutputVector, Result); 7029 7030 pInputVector += InputStride; 7031 pOutputVector += OutputStride; 7032 } 7033 7034 return pOutputStream; 7035 7036#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 7037#endif // _XM_VMX128_INTRINSICS_ 7038} 7039 7040/**************************************************************************** 7041 * 7042 * 3D Vector 7043 * 7044 
****************************************************************************/ 7045 7046//------------------------------------------------------------------------------ 7047// Comparison operations 7048//------------------------------------------------------------------------------ 7049 7050//------------------------------------------------------------------------------ 7051 7052inline bool XMVector3Equal 7053( 7054 FXMVECTOR V1, 7055 FXMVECTOR V2 7056) 7057{ 7058#if defined(_XM_NO_INTRINSICS_) 7059 return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0); 7060#elif defined(_XM_ARM_NEON_INTRINSICS_) 7061 __n128 vResult = vceqq_f32( V1, V2 ); 7062 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7063 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7064 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7065#elif defined(_XM_SSE_INTRINSICS_) 7066 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 7067 return (((_mm_movemask_ps(vTemp)&7)==7) != 0); 7068#else // _XM_VMX128_INTRINSICS_ 7069 return XMComparisonAllTrue(XMVector3EqualR(V1, V2)); 7070#endif 7071} 7072 7073//------------------------------------------------------------------------------ 7074 7075inline uint32_t XMVector3EqualR 7076( 7077 FXMVECTOR V1, 7078 FXMVECTOR V2 7079) 7080{ 7081#if defined(_XM_NO_INTRINSICS_) 7082 uint32_t CR = 0; 7083 if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && 7084 (V1.vector4_f32[1] == V2.vector4_f32[1]) && 7085 (V1.vector4_f32[2] == V2.vector4_f32[2])) 7086 { 7087 CR = XM_CRMASK_CR6TRUE; 7088 } 7089 else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && 7090 (V1.vector4_f32[1] != V2.vector4_f32[1]) && 7091 (V1.vector4_f32[2] != V2.vector4_f32[2])) 7092 { 7093 CR = XM_CRMASK_CR6FALSE; 7094 } 7095 return CR; 7096#elif defined(_XM_ARM_NEON_INTRINSICS_) 7097 __n128 vResult = vceqq_f32( V1, V2 ); 7098 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), 
vget_high_u8(vResult)); 7099 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7100 uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; 7101 7102 uint32_t CR = 0; 7103 if ( r == 0xFFFFFFU ) 7104 { 7105 CR = XM_CRMASK_CR6TRUE; 7106 } 7107 else if ( !r ) 7108 { 7109 CR = XM_CRMASK_CR6FALSE; 7110 } 7111 return CR; 7112#elif defined(_XM_SSE_INTRINSICS_) 7113 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 7114 int iTest = _mm_movemask_ps(vTemp)&7; 7115 uint32_t CR = 0; 7116 if (iTest==7) 7117 { 7118 CR = XM_CRMASK_CR6TRUE; 7119 } 7120 else if (!iTest) 7121 { 7122 CR = XM_CRMASK_CR6FALSE; 7123 } 7124 return CR; 7125#else // _XM_VMX128_INTRINSICS_ 7126#endif // _XM_VMX128_INTRINSICS_ 7127} 7128 7129//------------------------------------------------------------------------------ 7130 7131inline bool XMVector3EqualInt 7132( 7133 FXMVECTOR V1, 7134 FXMVECTOR V2 7135) 7136{ 7137#if defined(_XM_NO_INTRINSICS_) 7138 return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0); 7139#elif defined(_XM_ARM_NEON_INTRINSICS_) 7140 __n128 vResult = vceqq_u32( V1, V2 ); 7141 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7142 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7143 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7144#elif defined(_XM_SSE_INTRINSICS_) 7145 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 7146 return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)==7) != 0); 7147#else // _XM_VMX128_INTRINSICS_ 7148 return XMComparisonAllTrue(XMVector3EqualIntR(V1, V2)); 7149#endif 7150} 7151 7152//------------------------------------------------------------------------------ 7153 7154inline uint32_t XMVector3EqualIntR 7155( 7156 FXMVECTOR V1, 7157 FXMVECTOR V2 7158) 7159{ 7160#if defined(_XM_NO_INTRINSICS_) 7161 uint32_t CR = 0; 7162 if ((V1.vector4_u32[0] == V2.vector4_u32[0]) && 7163 (V1.vector4_u32[1] == V2.vector4_u32[1]) && 
7164 (V1.vector4_u32[2] == V2.vector4_u32[2])) 7165 { 7166 CR = XM_CRMASK_CR6TRUE; 7167 } 7168 else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) && 7169 (V1.vector4_u32[1] != V2.vector4_u32[1]) && 7170 (V1.vector4_u32[2] != V2.vector4_u32[2])) 7171 { 7172 CR = XM_CRMASK_CR6FALSE; 7173 } 7174 return CR; 7175#elif defined(_XM_ARM_NEON_INTRINSICS_) 7176 __n128 vResult = vceqq_u32( V1, V2 ); 7177 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7178 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7179 uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; 7180 7181 uint32_t CR = 0; 7182 if ( r == 0xFFFFFFU ) 7183 { 7184 CR = XM_CRMASK_CR6TRUE; 7185 } 7186 else if ( !r ) 7187 { 7188 CR = XM_CRMASK_CR6FALSE; 7189 } 7190 return CR; 7191#elif defined(_XM_SSE_INTRINSICS_) 7192 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 7193 int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&7; 7194 uint32_t CR = 0; 7195 if (iTemp==7) 7196 { 7197 CR = XM_CRMASK_CR6TRUE; 7198 } 7199 else if (!iTemp) 7200 { 7201 CR = XM_CRMASK_CR6FALSE; 7202 } 7203 return CR; 7204#else // _XM_VMX128_INTRINSICS_ 7205#endif // _XM_VMX128_INTRINSICS_ 7206} 7207 7208//------------------------------------------------------------------------------ 7209 7210inline bool XMVector3NearEqual 7211( 7212 FXMVECTOR V1, 7213 FXMVECTOR V2, 7214 FXMVECTOR Epsilon 7215) 7216{ 7217#if defined(_XM_NO_INTRINSICS_) 7218 float dx, dy, dz; 7219 7220 dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); 7221 dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); 7222 dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); 7223 return (((dx <= Epsilon.vector4_f32[0]) && 7224 (dy <= Epsilon.vector4_f32[1]) && 7225 (dz <= Epsilon.vector4_f32[2])) != 0); 7226#elif defined(_XM_ARM_NEON_INTRINSICS_) 7227 __n128 vDelta = vsubq_f32( V1, V2 ); 7228 __n128 vResult = vacleq_f32( vDelta, Epsilon ); 7229 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7230 vTemp = 
vzip_u16(vTemp.val[0], vTemp.val[1]); 7231 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7232#elif defined(_XM_SSE_INTRINSICS_) 7233 // Get the difference 7234 XMVECTOR vDelta = _mm_sub_ps(V1,V2); 7235 // Get the absolute value of the difference 7236 XMVECTOR vTemp = _mm_setzero_ps(); 7237 vTemp = _mm_sub_ps(vTemp,vDelta); 7238 vTemp = _mm_max_ps(vTemp,vDelta); 7239 vTemp = _mm_cmple_ps(vTemp,Epsilon); 7240 // w is don't care 7241 return (((_mm_movemask_ps(vTemp)&7)==0x7) != 0); 7242#else // _XM_VMX128_INTRINSICS_ 7243#endif // _XM_VMX128_INTRINSICS_ 7244} 7245 7246//------------------------------------------------------------------------------ 7247 7248inline bool XMVector3NotEqual 7249( 7250 FXMVECTOR V1, 7251 FXMVECTOR V2 7252) 7253{ 7254#if defined(_XM_NO_INTRINSICS_) 7255 return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0); 7256#elif defined(_XM_ARM_NEON_INTRINSICS_) 7257 __n128 vResult = vceqq_f32( V1, V2 ); 7258 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7259 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7260 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); 7261#elif defined(_XM_SSE_INTRINSICS_) 7262 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 7263 return (((_mm_movemask_ps(vTemp)&7)!=7) != 0); 7264#else // _XM_VMX128_INTRINSICS_ 7265 return XMComparisonAnyFalse(XMVector3EqualR(V1, V2)); 7266#endif 7267} 7268 7269//------------------------------------------------------------------------------ 7270 7271inline bool XMVector3NotEqualInt 7272( 7273 FXMVECTOR V1, 7274 FXMVECTOR V2 7275) 7276{ 7277#if defined(_XM_NO_INTRINSICS_) 7278 return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0); 7279#elif defined(_XM_ARM_NEON_INTRINSICS_) 7280 __n128 vResult = vceqq_u32( V1, V2 ); 7281 int8x8x2_t vTemp = 
vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7282 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7283 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); 7284#elif defined(_XM_SSE_INTRINSICS_) 7285 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 7286 return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)!=7) != 0); 7287#else // _XM_VMX128_INTRINSICS_ 7288 return XMComparisonAnyFalse(XMVector3EqualIntR(V1, V2)); 7289#endif 7290} 7291 7292//------------------------------------------------------------------------------ 7293 7294inline bool XMVector3Greater 7295( 7296 FXMVECTOR V1, 7297 FXMVECTOR V2 7298) 7299{ 7300#if defined(_XM_NO_INTRINSICS_) 7301 return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0); 7302#elif defined(_XM_ARM_NEON_INTRINSICS_) 7303 __n128 vResult = vcgtq_f32( V1, V2 ); 7304 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7305 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7306 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7307#elif defined(_XM_SSE_INTRINSICS_) 7308 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 7309 return (((_mm_movemask_ps(vTemp)&7)==7) != 0); 7310#else // _XM_VMX128_INTRINSICS_ 7311 return XMComparisonAllTrue(XMVector3GreaterR(V1, V2)); 7312#endif 7313} 7314 7315//------------------------------------------------------------------------------ 7316 7317inline uint32_t XMVector3GreaterR 7318( 7319 FXMVECTOR V1, 7320 FXMVECTOR V2 7321) 7322{ 7323#if defined(_XM_NO_INTRINSICS_) 7324 uint32_t CR = 0; 7325 if ((V1.vector4_f32[0] > V2.vector4_f32[0]) && 7326 (V1.vector4_f32[1] > V2.vector4_f32[1]) && 7327 (V1.vector4_f32[2] > V2.vector4_f32[2])) 7328 { 7329 CR = XM_CRMASK_CR6TRUE; 7330 } 7331 else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) && 7332 (V1.vector4_f32[1] <= V2.vector4_f32[1]) && 7333 (V1.vector4_f32[2] <= V2.vector4_f32[2])) 7334 { 7335 CR = 
XM_CRMASK_CR6FALSE; 7336 } 7337 return CR; 7338 7339#elif defined(_XM_ARM_NEON_INTRINSICS_) 7340 __n128 vResult = vcgtq_f32( V1, V2 ); 7341 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7342 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7343 uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; 7344 7345 uint32_t CR = 0; 7346 if ( r == 0xFFFFFFU ) 7347 { 7348 CR = XM_CRMASK_CR6TRUE; 7349 } 7350 else if ( !r ) 7351 { 7352 CR = XM_CRMASK_CR6FALSE; 7353 } 7354 return CR; 7355#elif defined(_XM_SSE_INTRINSICS_) 7356 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 7357 uint32_t CR = 0; 7358 int iTest = _mm_movemask_ps(vTemp)&7; 7359 if (iTest==7) 7360 { 7361 CR = XM_CRMASK_CR6TRUE; 7362 } 7363 else if (!iTest) 7364 { 7365 CR = XM_CRMASK_CR6FALSE; 7366 } 7367 return CR; 7368#else // _XM_VMX128_INTRINSICS_ 7369#endif // _XM_VMX128_INTRINSICS_ 7370} 7371 7372//------------------------------------------------------------------------------ 7373 7374inline bool XMVector3GreaterOrEqual 7375( 7376 FXMVECTOR V1, 7377 FXMVECTOR V2 7378) 7379{ 7380#if defined(_XM_NO_INTRINSICS_) 7381 return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0); 7382#elif defined(_XM_ARM_NEON_INTRINSICS_) 7383 __n128 vResult = vcgeq_f32( V1, V2 ); 7384 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7385 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7386 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7387#elif defined(_XM_SSE_INTRINSICS_) 7388 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 7389 return (((_mm_movemask_ps(vTemp)&7)==7) != 0); 7390#else // _XM_VMX128_INTRINSICS_ 7391 return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V1, V2)); 7392#endif 7393} 7394 7395//------------------------------------------------------------------------------ 7396 7397inline uint32_t XMVector3GreaterOrEqualR 7398( 7399 FXMVECTOR V1, 7400 FXMVECTOR V2 7401) 7402{ 7403#if 
defined(_XM_NO_INTRINSICS_) 7404 7405 uint32_t CR = 0; 7406 if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && 7407 (V1.vector4_f32[1] >= V2.vector4_f32[1]) && 7408 (V1.vector4_f32[2] >= V2.vector4_f32[2])) 7409 { 7410 CR = XM_CRMASK_CR6TRUE; 7411 } 7412 else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && 7413 (V1.vector4_f32[1] < V2.vector4_f32[1]) && 7414 (V1.vector4_f32[2] < V2.vector4_f32[2])) 7415 { 7416 CR = XM_CRMASK_CR6FALSE; 7417 } 7418 return CR; 7419 7420#elif defined(_XM_ARM_NEON_INTRINSICS_) 7421 __n128 vResult = vcgeq_f32( V1, V2 ); 7422 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7423 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7424 uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU; 7425 7426 uint32_t CR = 0; 7427 if ( r == 0xFFFFFFU ) 7428 { 7429 CR = XM_CRMASK_CR6TRUE; 7430 } 7431 else if ( !r ) 7432 { 7433 CR = XM_CRMASK_CR6FALSE; 7434 } 7435 return CR; 7436#elif defined(_XM_SSE_INTRINSICS_) 7437 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 7438 uint32_t CR = 0; 7439 int iTest = _mm_movemask_ps(vTemp)&7; 7440 if (iTest==7) 7441 { 7442 CR = XM_CRMASK_CR6TRUE; 7443 } 7444 else if (!iTest) 7445 { 7446 CR = XM_CRMASK_CR6FALSE; 7447 } 7448 return CR; 7449#else // _XM_VMX128_INTRINSICS_ 7450#endif // _XM_VMX128_INTRINSICS_ 7451} 7452 7453//------------------------------------------------------------------------------ 7454 7455inline bool XMVector3Less 7456( 7457 FXMVECTOR V1, 7458 FXMVECTOR V2 7459) 7460{ 7461#if defined(_XM_NO_INTRINSICS_) 7462 return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0); 7463#elif defined(_XM_ARM_NEON_INTRINSICS_) 7464 __n128 vResult = vcltq_f32( V1, V2 ); 7465 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7466 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7467 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7468#elif defined(_XM_SSE_INTRINSICS_) 7469 XMVECTOR 
vTemp = _mm_cmplt_ps(V1,V2); 7470 return (((_mm_movemask_ps(vTemp)&7)==7) != 0); 7471#else // _XM_VMX128_INTRINSICS_ 7472 return XMComparisonAllTrue(XMVector3GreaterR(V2, V1)); 7473#endif 7474} 7475 7476//------------------------------------------------------------------------------ 7477 7478inline bool XMVector3LessOrEqual 7479( 7480 FXMVECTOR V1, 7481 FXMVECTOR V2 7482) 7483{ 7484#if defined(_XM_NO_INTRINSICS_) 7485 return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0); 7486#elif defined(_XM_ARM_NEON_INTRINSICS_) 7487 __n128 vResult = vcleq_f32( V1, V2 ); 7488 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 7489 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7490 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7491#elif defined(_XM_SSE_INTRINSICS_) 7492 XMVECTOR vTemp = _mm_cmple_ps(V1,V2); 7493 return (((_mm_movemask_ps(vTemp)&7)==7) != 0); 7494#else // _XM_VMX128_INTRINSICS_ 7495 return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V2, V1)); 7496#endif 7497} 7498 7499//------------------------------------------------------------------------------ 7500 7501inline bool XMVector3InBounds 7502( 7503 FXMVECTOR V, 7504 FXMVECTOR Bounds 7505) 7506{ 7507#if defined(_XM_NO_INTRINSICS_) 7508 return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && 7509 (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && 7510 (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0); 7511#elif defined(_XM_ARM_NEON_INTRINSICS_) 7512 // Test if less than or equal 7513 __n128 vTemp1 = vcleq_f32(V,Bounds); 7514 // Negate the bounds 7515 __n128 vTemp2 = vnegq_f32(Bounds); 7516 // Test if greater or equal (Reversed) 7517 vTemp2 = vcleq_f32(vTemp2,V); 7518 // Blend answers 7519 vTemp1 = vandq_u32(vTemp1,vTemp2); 7520 // in bounds? 
7521 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); 7522 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7523 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU ); 7524#elif defined(_XM_SSE_INTRINSICS_) 7525 // Test if less than or equal 7526 XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); 7527 // Negate the bounds 7528 XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); 7529 // Test if greater or equal (Reversed) 7530 vTemp2 = _mm_cmple_ps(vTemp2,V); 7531 // Blend answers 7532 vTemp1 = _mm_and_ps(vTemp1,vTemp2); 7533 // x,y and z in bounds? (w is don't care) 7534 return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0); 7535#else 7536 return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds)); 7537#endif 7538} 7539 7540 7541//------------------------------------------------------------------------------ 7542 7543inline bool XMVector3IsNaN 7544( 7545 FXMVECTOR V 7546) 7547{ 7548#if defined(_XM_NO_INTRINSICS_) 7549 7550 return (XMISNAN(V.vector4_f32[0]) || 7551 XMISNAN(V.vector4_f32[1]) || 7552 XMISNAN(V.vector4_f32[2])); 7553 7554#elif defined(_XM_ARM_NEON_INTRINSICS_) 7555 // Test against itself. NaN is always not equal 7556 __n128 vTempNan = vceqq_f32( V, V ); 7557 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); 7558 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7559 // If x or y or z are NaN, the mask is zero 7560 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU ); 7561#elif defined(_XM_SSE_INTRINSICS_) 7562 // Test against itself. 
NaN is always not equal 7563 XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); 7564 // If x or y or z are NaN, the mask is non-zero 7565 return ((_mm_movemask_ps(vTempNan)&7) != 0); 7566#else // _XM_VMX128_INTRINSICS_ 7567#endif // _XM_VMX128_INTRINSICS_ 7568} 7569 7570//------------------------------------------------------------------------------ 7571 7572inline bool XMVector3IsInfinite 7573( 7574 FXMVECTOR V 7575) 7576{ 7577#if defined(_XM_NO_INTRINSICS_) 7578 return (XMISINF(V.vector4_f32[0]) || 7579 XMISINF(V.vector4_f32[1]) || 7580 XMISINF(V.vector4_f32[2])); 7581#elif defined(_XM_ARM_NEON_INTRINSICS_) 7582 // Mask off the sign bit 7583 __n128 vTempInf = vandq_u32( V, g_XMAbsMask ); 7584 // Compare to infinity 7585 vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); 7586 // If any are infinity, the signs are true. 7587 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); 7588 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 7589 return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0 ); 7590#elif defined(_XM_SSE_INTRINSICS_) 7591 // Mask off the sign bit 7592 __m128 vTemp = _mm_and_ps(V,g_XMAbsMask); 7593 // Compare to infinity 7594 vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); 7595 // If x,y or z are infinity, the signs are true. 
7596 return ((_mm_movemask_ps(vTemp)&7) != 0); 7597#else // _XM_VMX128_INTRINSICS_ 7598#endif // _XM_VMX128_INTRINSICS_ 7599} 7600 7601//------------------------------------------------------------------------------ 7602// Computation operations 7603//------------------------------------------------------------------------------ 7604 7605//------------------------------------------------------------------------------ 7606 7607inline XMVECTOR XMVector3Dot 7608( 7609 FXMVECTOR V1, 7610 FXMVECTOR V2 7611) 7612{ 7613#if defined(_XM_NO_INTRINSICS_) 7614 float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2]; 7615 XMVECTOR vResult = { 7616 fValue, 7617 fValue, 7618 fValue, 7619 fValue 7620 }; 7621 return vResult; 7622 7623#elif defined(_XM_ARM_NEON_INTRINSICS_) 7624 __n128 vTemp = vmulq_f32( V1, V2 ); 7625 __n64 v1 = vget_low_f32( vTemp ); 7626 __n64 v2 = vget_high_f32( vTemp ); 7627 v1 = vpadd_f32( v1, v1 ); 7628 v2 = vdup_lane_f32( v2, 0 ); 7629 v1 = vadd_f32( v1, v2 ); 7630 return vcombine_f32( v1, v1 ); 7631#elif defined(_XM_SSE_INTRINSICS_) 7632 // Perform the dot product 7633 XMVECTOR vDot = _mm_mul_ps(V1,V2); 7634 // x=Dot.vector4_f32[1], y=Dot.vector4_f32[2] 7635 XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); 7636 // Result.vector4_f32[0] = x+y 7637 vDot = _mm_add_ss(vDot,vTemp); 7638 // x=Dot.vector4_f32[2] 7639 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7640 // Result.vector4_f32[0] = (x+y)+z 7641 vDot = _mm_add_ss(vDot,vTemp); 7642 // Splat x 7643 return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); 7644#else // _XM_VMX128_INTRINSICS_ 7645#endif // _XM_VMX128_INTRINSICS_ 7646} 7647 7648//------------------------------------------------------------------------------ 7649 7650inline XMVECTOR XMVector3Cross 7651( 7652 FXMVECTOR V1, 7653 FXMVECTOR V2 7654) 7655{ 7656 // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ] 7657 7658#if 
defined(_XM_NO_INTRINSICS_) 7659 XMVECTOR vResult = { 7660 (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]), 7661 (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]), 7662 (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]), 7663 0.0f 7664 }; 7665 return vResult; 7666#elif defined(_XM_ARM_NEON_INTRINSICS_) 7667 __n64 v1xy = vget_low_f32(V1); 7668 __n64 v2xy = vget_low_f32(V2); 7669 7670 __n64 v1yx = vrev64_f32( v1xy ); 7671 __n64 v2yx = vrev64_f32( v2xy ); 7672 7673 __n64 v1zz = vdup_lane_f32( vget_high_f32(V1), 0 ); 7674 __n64 v2zz = vdup_lane_f32( vget_high_f32(V2), 0 ); 7675 7676 __n128 vResult = vmulq_f32( vcombine_f32(v1yx,v1xy), vcombine_f32(v2zz,v2yx) ); 7677 vResult = vmlsq_f32( vResult, vcombine_f32(v1zz,v1yx), vcombine_f32(v2yx,v2xy) ); 7678 return veorq_u32( vResult, g_XMFlipY ); 7679#elif defined(_XM_SSE_INTRINSICS_) 7680 // y1,z1,x1,w1 7681 XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(3,0,2,1)); 7682 // z2,x2,y2,w2 7683 XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(3,1,0,2)); 7684 // Perform the left operation 7685 XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2); 7686 // z1,x1,y1,w1 7687 vTemp1 = XM_PERMUTE_PS(vTemp1,_MM_SHUFFLE(3,0,2,1)); 7688 // y2,z2,x2,w2 7689 vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(3,1,0,2)); 7690 // Perform the right operation 7691 vTemp1 = _mm_mul_ps(vTemp1,vTemp2); 7692 // Subract the right from left, and return answer 7693 vResult = _mm_sub_ps(vResult,vTemp1); 7694 // Set w to zero 7695 return _mm_and_ps(vResult,g_XMMask3); 7696#else // _XM_VMX128_INTRINSICS_ 7697#endif // _XM_VMX128_INTRINSICS_ 7698} 7699 7700//------------------------------------------------------------------------------ 7701 7702inline XMVECTOR XMVector3LengthSq 7703( 7704 FXMVECTOR V 7705) 7706{ 7707 return XMVector3Dot(V, V); 7708} 7709 7710//------------------------------------------------------------------------------ 7711 7712inline XMVECTOR 
XMVector3ReciprocalLengthEst 7713( 7714 FXMVECTOR V 7715) 7716{ 7717#if defined(_XM_NO_INTRINSICS_) 7718 7719 XMVECTOR Result; 7720 7721 Result = XMVector3LengthSq(V); 7722 Result = XMVectorReciprocalSqrtEst(Result); 7723 7724 return Result; 7725 7726#elif defined(_XM_ARM_NEON_INTRINSICS_) 7727 // Dot3 7728 __n128 vTemp = vmulq_f32( V, V ); 7729 __n64 v1 = vget_low_f32( vTemp ); 7730 __n64 v2 = vget_high_f32( vTemp ); 7731 v1 = vpadd_f32( v1, v1 ); 7732 v2 = vdup_lane_f32( v2, 0 ); 7733 v1 = vadd_f32( v1, v2 ); 7734 // Reciprocal sqrt (estimate) 7735 v2 = vrsqrte_f32( v1 ); 7736 return vcombine_f32(v2, v2); 7737#elif defined(_XM_SSE_INTRINSICS_) 7738 // Perform the dot product on x,y and z 7739 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 7740 // vTemp has z and y 7741 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); 7742 // x+z, y 7743 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7744 // y,y,y,y 7745 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7746 // x+z+y,??,??,?? 7747 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7748 // Splat the length squared 7749 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 7750 // Get the reciprocal 7751 vLengthSq = _mm_rsqrt_ps(vLengthSq); 7752 return vLengthSq; 7753#else // _XM_VMX128_INTRINSICS_ 7754#endif // _XM_VMX128_INTRINSICS_ 7755} 7756 7757//------------------------------------------------------------------------------ 7758 7759inline XMVECTOR XMVector3ReciprocalLength 7760( 7761 FXMVECTOR V 7762) 7763{ 7764#if defined(_XM_NO_INTRINSICS_) 7765 7766 XMVECTOR Result; 7767 7768 Result = XMVector3LengthSq(V); 7769 Result = XMVectorReciprocalSqrt(Result); 7770 7771 return Result; 7772 7773#elif defined(_XM_ARM_NEON_INTRINSICS_) 7774 // Dot3 7775 __n128 vTemp = vmulq_f32( V, V ); 7776 __n64 v1 = vget_low_f32( vTemp ); 7777 __n64 v2 = vget_high_f32( vTemp ); 7778 v1 = vpadd_f32( v1, v1 ); 7779 v2 = vdup_lane_f32( v2, 0 ); 7780 v1 = vadd_f32( v1, v2 ); 7781 // Reciprocal sqrt 7782 __n64 S0 = vrsqrte_f32(v1); 7783 
__n64 P0 = vmul_f32( v1, S0 ); 7784 __n64 R0 = vrsqrts_f32( P0, S0 ); 7785 __n64 S1 = vmul_f32( S0, R0 ); 7786 __n64 P1 = vmul_f32( v1, S1 ); 7787 __n64 R1 = vrsqrts_f32( P1, S1 ); 7788 __n64 Result = vmul_f32( S1, R1 ); 7789 return vcombine_f32( Result, Result ); 7790#elif defined(_XM_SSE_INTRINSICS_) 7791 // Perform the dot product 7792 XMVECTOR vDot = _mm_mul_ps(V,V); 7793 // x=Dot.y, y=Dot.z 7794 XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); 7795 // Result.x = x+y 7796 vDot = _mm_add_ss(vDot,vTemp); 7797 // x=Dot.z 7798 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7799 // Result.x = (x+y)+z 7800 vDot = _mm_add_ss(vDot,vTemp); 7801 // Splat x 7802 vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); 7803 // Get the reciprocal 7804 vDot = _mm_sqrt_ps(vDot); 7805 // Get the reciprocal 7806 vDot = _mm_div_ps(g_XMOne,vDot); 7807 return vDot; 7808#else // _XM_VMX128_INTRINSICS_ 7809#endif // _XM_VMX128_INTRINSICS_ 7810} 7811 7812//------------------------------------------------------------------------------ 7813 7814inline XMVECTOR XMVector3LengthEst 7815( 7816 FXMVECTOR V 7817) 7818{ 7819#if defined(_XM_NO_INTRINSICS_) 7820 7821 XMVECTOR Result; 7822 7823 Result = XMVector3LengthSq(V); 7824 Result = XMVectorSqrtEst(Result); 7825 7826 return Result; 7827 7828#elif defined(_XM_ARM_NEON_INTRINSICS_) 7829 // Dot3 7830 __n128 vTemp = vmulq_f32( V, V ); 7831 __n64 v1 = vget_low_f32( vTemp ); 7832 __n64 v2 = vget_high_f32( vTemp ); 7833 v1 = vpadd_f32( v1, v1 ); 7834 v2 = vdup_lane_f32( v2, 0 ); 7835 v1 = vadd_f32( v1, v2 ); 7836 const __n64 zero = vdup_n_u32(0); 7837 __n64 VEqualsZero = vceq_f32( v1, zero ); 7838 // Sqrt (estimate) 7839 __n64 Result = vrsqrte_f32( v1 ); 7840 Result = vmul_f32( v1, Result ); 7841 Result = vbsl_f32( VEqualsZero, zero, Result ); 7842 return vcombine_f32( Result, Result ); 7843#elif defined(_XM_SSE_INTRINSICS_) 7844 // Perform the dot product on x,y and z 7845 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 7846 // vTemp has z and y 
7847 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); 7848 // x+z, y 7849 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7850 // y,y,y,y 7851 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7852 // x+z+y,??,??,?? 7853 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7854 // Splat the length squared 7855 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 7856 // Get the length 7857 vLengthSq = _mm_sqrt_ps(vLengthSq); 7858 return vLengthSq; 7859#else // _XM_VMX128_INTRINSICS_ 7860#endif // _XM_VMX128_INTRINSICS_ 7861} 7862 7863//------------------------------------------------------------------------------ 7864 7865inline XMVECTOR XMVector3Length 7866( 7867 FXMVECTOR V 7868) 7869{ 7870#if defined(_XM_NO_INTRINSICS_) 7871 7872 XMVECTOR Result; 7873 7874 Result = XMVector3LengthSq(V); 7875 Result = XMVectorSqrt(Result); 7876 7877 return Result; 7878 7879#elif defined(_XM_ARM_NEON_INTRINSICS_) 7880 // Dot3 7881 __n128 vTemp = vmulq_f32( V, V ); 7882 __n64 v1 = vget_low_f32( vTemp ); 7883 __n64 v2 = vget_high_f32( vTemp ); 7884 v1 = vpadd_f32( v1, v1 ); 7885 v2 = vdup_lane_f32( v2, 0 ); 7886 v1 = vadd_f32( v1, v2 ); 7887 const __n64 zero = vdup_n_u32(0); 7888 __n64 VEqualsZero = vceq_f32( v1, zero ); 7889 // Sqrt 7890 __n64 S0 = vrsqrte_f32( v1 ); 7891 __n64 P0 = vmul_f32( v1, S0 ); 7892 __n64 R0 = vrsqrts_f32( P0, S0 ); 7893 __n64 S1 = vmul_f32( S0, R0 ); 7894 __n64 P1 = vmul_f32( v1, S1 ); 7895 __n64 R1 = vrsqrts_f32( P1, S1 ); 7896 __n64 Result = vmul_f32( S1, R1 ); 7897 Result = vmul_f32( v1, Result ); 7898 Result = vbsl_f32( VEqualsZero, zero, Result ); 7899 return vcombine_f32( Result, Result ); 7900#elif defined(_XM_SSE_INTRINSICS_) 7901 // Perform the dot product on x,y and z 7902 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 7903 // vTemp has z and y 7904 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2)); 7905 // x+z, y 7906 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7907 // y,y,y,y 7908 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7909 // 
x+z+y,??,??,?? 7910 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 7911 // Splat the length squared 7912 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 7913 // Get the length 7914 vLengthSq = _mm_sqrt_ps(vLengthSq); 7915 return vLengthSq; 7916#else // _XM_VMX128_INTRINSICS_ 7917#endif // _XM_VMX128_INTRINSICS_ 7918} 7919 7920//------------------------------------------------------------------------------ 7921// XMVector3NormalizeEst uses a reciprocal estimate and 7922// returns QNaN on zero and infinite vectors. 7923 7924inline XMVECTOR XMVector3NormalizeEst 7925( 7926 FXMVECTOR V 7927) 7928{ 7929#if defined(_XM_NO_INTRINSICS_) 7930 7931 XMVECTOR Result; 7932 Result = XMVector3ReciprocalLength(V); 7933 Result = XMVectorMultiply(V, Result); 7934 return Result; 7935 7936#elif defined(_XM_ARM_NEON_INTRINSICS_) 7937 // Dot3 7938 __n128 vTemp = vmulq_f32( V, V ); 7939 __n64 v1 = vget_low_f32( vTemp ); 7940 __n64 v2 = vget_high_f32( vTemp ); 7941 v1 = vpadd_f32( v1, v1 ); 7942 v2 = vdup_lane_f32( v2, 0 ); 7943 v1 = vadd_f32( v1, v2 ); 7944 // Reciprocal sqrt (estimate) 7945 v2 = vrsqrte_f32( v1 ); 7946 // Normalize 7947 return vmulq_f32( V, vcombine_f32(v2,v2) ); 7948#elif defined(_XM_SSE_INTRINSICS_) 7949 // Perform the dot product 7950 XMVECTOR vDot = _mm_mul_ps(V,V); 7951 // x=Dot.y, y=Dot.z 7952 XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1)); 7953 // Result.x = x+y 7954 vDot = _mm_add_ss(vDot,vTemp); 7955 // x=Dot.z 7956 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 7957 // Result.x = (x+y)+z 7958 vDot = _mm_add_ss(vDot,vTemp); 7959 // Splat x 7960 vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0)); 7961 // Get the reciprocal 7962 vDot = _mm_rsqrt_ps(vDot); 7963 // Perform the normalization 7964 vDot = _mm_mul_ps(vDot,V); 7965 return vDot; 7966#else // _XM_VMX128_INTRINSICS_ 7967#endif // _XM_VMX128_INTRINSICS_ 7968} 7969 7970//------------------------------------------------------------------------------ 7971 7972inline XMVECTOR XMVector3Normalize 
7973( 7974 FXMVECTOR V 7975) 7976{ 7977#if defined(_XM_NO_INTRINSICS_) 7978 float fLength; 7979 XMVECTOR vResult; 7980 7981 vResult = XMVector3Length( V ); 7982 fLength = vResult.vector4_f32[0]; 7983 7984 // Prevent divide by zero 7985 if (fLength > 0) { 7986 fLength = 1.0f/fLength; 7987 } 7988 7989 vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; 7990 vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; 7991 vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; 7992 vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; 7993 return vResult; 7994 7995#elif defined(_XM_ARM_NEON_INTRINSICS_) 7996 // Dot3 7997 __n128 vTemp = vmulq_f32( V, V ); 7998 __n64 v1 = vget_low_f32( vTemp ); 7999 __n64 v2 = vget_high_f32( vTemp ); 8000 v1 = vpadd_f32( v1, v1 ); 8001 v2 = vdup_lane_f32( v2, 0 ); 8002 v1 = vadd_f32( v1, v2 ); 8003 __n64 VEqualsZero = vceq_f32( v1, vdup_n_u32(0) ); 8004 __n64 VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); 8005 // Reciprocal sqrt (2 iterations of Newton-Raphson) 8006 __n64 S0 = vrsqrte_f32( v1 ); 8007 __n64 P0 = vmul_f32( v1, S0 ); 8008 __n64 R0 = vrsqrts_f32( P0, S0 ); 8009 __n64 S1 = vmul_f32( S0, R0 ); 8010 __n64 P1 = vmul_f32( v1, S1 ); 8011 __n64 R1 = vrsqrts_f32( P1, S1 ); 8012 v2 = vmul_f32( S1, R1 ); 8013 // Normalize 8014 __n128 vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); 8015 vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); 8016 return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); 8017#elif defined(_XM_SSE_INTRINSICS_) 8018 // Perform the dot product on x,y and z only 8019 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 8020 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1)); 8021 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 8022 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1)); 8023 vLengthSq = _mm_add_ss(vLengthSq,vTemp); 8024 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0)); 8025 // Prepare for the division 8026 XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); 8027 // 
Create zero with a single instruction 8028 XMVECTOR vZeroMask = _mm_setzero_ps(); 8029 // Test for a divide by zero (Must be FP to detect -0.0) 8030 vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); 8031 // Failsafe on zero (Or epsilon) length planes 8032 // If the length is infinity, set the elements to zero 8033 vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); 8034 // Divide to perform the normalization 8035 vResult = _mm_div_ps(V,vResult); 8036 // Any that are infinity, set to zero 8037 vResult = _mm_and_ps(vResult,vZeroMask); 8038 // Select qnan or result based on infinite length 8039 XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); 8040 XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); 8041 vResult = _mm_or_ps(vTemp1,vTemp2); 8042 return vResult; 8043#else // _XM_VMX128_INTRINSICS_ 8044#endif // _XM_VMX128_INTRINSICS_ 8045} 8046 8047//------------------------------------------------------------------------------ 8048 8049inline XMVECTOR XMVector3ClampLength 8050( 8051 FXMVECTOR V, 8052 float LengthMin, 8053 float LengthMax 8054) 8055{ 8056 XMVECTOR ClampMax = XMVectorReplicate(LengthMax); 8057 XMVECTOR ClampMin = XMVectorReplicate(LengthMin); 8058 8059 return XMVector3ClampLengthV(V, ClampMin, ClampMax); 8060} 8061 8062//------------------------------------------------------------------------------ 8063 8064inline XMVECTOR XMVector3ClampLengthV 8065( 8066 FXMVECTOR V, 8067 FXMVECTOR LengthMin, 8068 FXMVECTOR LengthMax 8069) 8070{ 8071 assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin))); 8072 assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax))); 8073 assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero())); 8074 assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero())); 8075 assert(XMVector3GreaterOrEqual(LengthMax, LengthMin)); 8076 8077 XMVECTOR LengthSq = XMVector3LengthSq(V); 8078 8079 const XMVECTOR Zero = XMVectorZero(); 
8080 8081 XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); 8082 8083 XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); 8084 XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); 8085 8086 XMVECTOR Normal = XMVectorMultiply(V, RcpLength); 8087 8088 XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); 8089 8090 XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); 8091 Length = XMVectorSelect(LengthSq, Length, Select); 8092 Normal = XMVectorSelect(LengthSq, Normal, Select); 8093 8094 XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); 8095 XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); 8096 8097 XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); 8098 ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); 8099 8100 XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); 8101 8102 // Preserve the original vector (with no precision loss) if the length falls within the given range 8103 XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin); 8104 Result = XMVectorSelect(Result, V, Control); 8105 8106 return Result; 8107} 8108 8109//------------------------------------------------------------------------------ 8110 8111inline XMVECTOR XMVector3Reflect 8112( 8113 FXMVECTOR Incident, 8114 FXMVECTOR Normal 8115) 8116{ 8117 // Result = Incident - (2 * dot(Incident, Normal)) * Normal 8118 8119 XMVECTOR Result = XMVector3Dot(Incident, Normal); 8120 Result = XMVectorAdd(Result, Result); 8121 Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); 8122 8123 return Result; 8124} 8125 8126//------------------------------------------------------------------------------ 8127 8128inline XMVECTOR XMVector3Refract 8129( 8130 FXMVECTOR Incident, 8131 FXMVECTOR Normal, 8132 float RefractionIndex 8133) 8134{ 8135 XMVECTOR Index = XMVectorReplicate(RefractionIndex); 8136 return XMVector3RefractV(Incident, Normal, Index); 8137} 8138 
8139//------------------------------------------------------------------------------ 8140 8141inline XMVECTOR XMVector3RefractV 8142( 8143 FXMVECTOR Incident, 8144 FXMVECTOR Normal, 8145 FXMVECTOR RefractionIndex 8146) 8147{ 8148 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + 8149 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) 8150 8151#if defined(_XM_NO_INTRINSICS_) 8152 8153 const XMVECTOR Zero = XMVectorZero(); 8154 8155 XMVECTOR IDotN = XMVector3Dot(Incident, Normal); 8156 8157 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 8158 XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); 8159 R = XMVectorMultiply(R, RefractionIndex); 8160 R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); 8161 8162 if (XMVector4LessOrEqual(R, Zero)) 8163 { 8164 // Total internal reflection 8165 return Zero; 8166 } 8167 else 8168 { 8169 // R = RefractionIndex * IDotN + sqrt(R) 8170 R = XMVectorSqrt(R); 8171 R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); 8172 8173 // Result = RefractionIndex * Incident - Normal * R 8174 XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident); 8175 Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); 8176 8177 return Result; 8178 } 8179 8180#elif defined(_XM_ARM_NEON_INTRINSICS_) 8181 XMVECTOR IDotN = XMVector3Dot(Incident,Normal); 8182 8183 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 8184 __n128 R = vmlsq_f32( g_XMOne, IDotN, IDotN); 8185 R = vmulq_f32(R, RefractionIndex); 8186 R = vmlsq_f32(g_XMOne, R, RefractionIndex ); 8187 8188 __n128 vResult = vcleq_f32(R,g_XMZero); 8189 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 8190 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 8191 if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) 8192 { 8193 // Total internal reflection 8194 vResult = g_XMZero; 8195 } 8196 else 8197 { 
8198 // Sqrt(R) 8199 __n128 S0 = vrsqrteq_f32(R); 8200 __n128 P0 = vmulq_f32( R, S0 ); 8201 __n128 R0 = vrsqrtsq_f32( P0, S0 ); 8202 __n128 S1 = vmulq_f32( S0, R0 ); 8203 __n128 P1 = vmulq_f32( R, S1 ); 8204 __n128 R1 = vrsqrtsq_f32( P1, S1 ); 8205 __n128 S2 = vmulq_f32( S1, R1 ); 8206 R = vmulq_f32( R, S2 ); 8207 // R = RefractionIndex * IDotN + sqrt(R) 8208 R = vmlaq_f32( R, RefractionIndex, IDotN ); 8209 // Result = RefractionIndex * Incident - Normal * R 8210 vResult = vmulq_f32(RefractionIndex, Incident); 8211 vResult = vmlsq_f32( vResult, R, Normal ); 8212 } 8213 return vResult; 8214#elif defined(_XM_SSE_INTRINSICS_) 8215 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + 8216 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) 8217 XMVECTOR IDotN = XMVector3Dot(Incident, Normal); 8218 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 8219 XMVECTOR R = _mm_mul_ps(IDotN, IDotN); 8220 R = _mm_sub_ps(g_XMOne,R); 8221 R = _mm_mul_ps(R, RefractionIndex); 8222 R = _mm_mul_ps(R, RefractionIndex); 8223 R = _mm_sub_ps(g_XMOne,R); 8224 8225 XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); 8226 if (_mm_movemask_ps(vResult)==0x0f) 8227 { 8228 // Total internal reflection 8229 vResult = g_XMZero; 8230 } 8231 else 8232 { 8233 // R = RefractionIndex * IDotN + sqrt(R) 8234 R = _mm_sqrt_ps(R); 8235 vResult = _mm_mul_ps(RefractionIndex,IDotN); 8236 R = _mm_add_ps(R,vResult); 8237 // Result = RefractionIndex * Incident - Normal * R 8238 vResult = _mm_mul_ps(RefractionIndex, Incident); 8239 R = _mm_mul_ps(R,Normal); 8240 vResult = _mm_sub_ps(vResult,R); 8241 } 8242 return vResult; 8243#else // _XM_VMX128_INTRINSICS_ 8244#endif // _XM_VMX128_INTRINSICS_ 8245} 8246 8247//------------------------------------------------------------------------------ 8248 8249inline XMVECTOR XMVector3Orthogonal 8250( 8251 FXMVECTOR V 8252) 8253{ 8254#if defined(_XM_NO_INTRINSICS_) || 
defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8255 8256 XMVECTOR Zero = XMVectorZero(); 8257 XMVECTOR Z = XMVectorSplatZ(V); 8258 XMVECTOR YZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V); 8259 8260 XMVECTOR NegativeV = XMVectorSubtract(Zero, V); 8261 8262 XMVECTOR ZIsNegative = XMVectorLess(Z, Zero); 8263 XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero); 8264 8265 XMVECTOR S = XMVectorAdd(YZYY, Z); 8266 XMVECTOR D = XMVectorSubtract(YZYY, Z); 8267 8268 XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative); 8269 8270 XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(NegativeV, S); 8271 XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, D); 8272 8273 return XMVectorSelect(R1, R0, Select); 8274 8275#else // _XM_VMX128_INTRINSICS_ 8276#endif // _XM_VMX128_INTRINSICS_ 8277} 8278 8279//------------------------------------------------------------------------------ 8280 8281inline XMVECTOR XMVector3AngleBetweenNormalsEst 8282( 8283 FXMVECTOR N1, 8284 FXMVECTOR N2 8285) 8286{ 8287#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8288 8289 XMVECTOR Result = XMVector3Dot(N1, N2); 8290 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); 8291 Result = XMVectorACosEst(Result); 8292 return Result; 8293 8294#else // _XM_VMX128_INTRINSICS_ 8295#endif // _XM_VMX128_INTRINSICS_ 8296} 8297 8298//------------------------------------------------------------------------------ 8299 8300inline XMVECTOR XMVector3AngleBetweenNormals 8301( 8302 FXMVECTOR N1, 8303 FXMVECTOR N2 8304) 8305{ 8306#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8307 8308 XMVECTOR Result = XMVector3Dot(N1, N2); 8309 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); 8310 Result = XMVectorACos(Result); 8311 return Result; 8312 8313#else // 
_XM_VMX128_INTRINSICS_ 8314#endif // _XM_VMX128_INTRINSICS_ 8315} 8316 8317//------------------------------------------------------------------------------ 8318 8319inline XMVECTOR XMVector3AngleBetweenVectors 8320( 8321 FXMVECTOR V1, 8322 FXMVECTOR V2 8323) 8324{ 8325#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8326 8327 XMVECTOR L1 = XMVector3ReciprocalLength(V1); 8328 XMVECTOR L2 = XMVector3ReciprocalLength(V2); 8329 8330 XMVECTOR Dot = XMVector3Dot(V1, V2); 8331 8332 L1 = XMVectorMultiply(L1, L2); 8333 8334 XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); 8335 CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); 8336 8337 return XMVectorACos(CosAngle); 8338 8339#else // _XM_VMX128_INTRINSICS_ 8340#endif // _XM_VMX128_INTRINSICS_ 8341} 8342 8343//------------------------------------------------------------------------------ 8344 8345inline XMVECTOR XMVector3LinePointDistance 8346( 8347 FXMVECTOR LinePoint1, 8348 FXMVECTOR LinePoint2, 8349 FXMVECTOR Point 8350) 8351{ 8352 // Given a vector PointVector from LinePoint1 to Point and a vector 8353 // LineVector from LinePoint1 to LinePoint2, the scaled distance 8354 // PointProjectionScale from LinePoint1 to the perpendicular projection 8355 // of PointVector onto the line is defined as: 8356 // 8357 // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector) 8358 8359#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8360 8361 XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1); 8362 XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1); 8363 8364 XMVECTOR LengthSq = XMVector3LengthSq(LineVector); 8365 8366 XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector); 8367 PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq); 8368 8369 XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale); 8370 DistanceVector = 
XMVectorSubtract(PointVector, DistanceVector); 8371 8372 return XMVector3Length(DistanceVector); 8373 8374#else // _XM_VMX128_INTRINSICS_ 8375#endif // _XM_VMX128_INTRINSICS_ 8376} 8377 8378//------------------------------------------------------------------------------ 8379 8380_Use_decl_annotations_ 8381inline void XMVector3ComponentsFromNormal 8382( 8383 XMVECTOR* pParallel, 8384 XMVECTOR* pPerpendicular, 8385 FXMVECTOR V, 8386 FXMVECTOR Normal 8387) 8388{ 8389 assert(pParallel != NULL); 8390 assert(pPerpendicular != NULL); 8391 8392 XMVECTOR Scale = XMVector3Dot(V, Normal); 8393 8394 XMVECTOR Parallel = XMVectorMultiply(Normal, Scale); 8395 8396 *pParallel = Parallel; 8397 *pPerpendicular = XMVectorSubtract(V, Parallel); 8398} 8399 8400//------------------------------------------------------------------------------ 8401// Transform a vector using a rotation expressed as a unit quaternion 8402 8403inline XMVECTOR XMVector3Rotate 8404( 8405 FXMVECTOR V, 8406 FXMVECTOR RotationQuaternion 8407) 8408{ 8409#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8410 8411 XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); 8412 XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); 8413 XMVECTOR Result = XMQuaternionMultiply(Q, A); 8414 return XMQuaternionMultiply(Result, RotationQuaternion); 8415 8416#else // _XM_VMX128_INTRINSICS_ 8417#endif // _XM_VMX128_INTRINSICS_ 8418} 8419 8420//------------------------------------------------------------------------------ 8421// Transform a vector using the inverse of a rotation expressed as a unit quaternion 8422 8423inline XMVECTOR XMVector3InverseRotate 8424( 8425 FXMVECTOR V, 8426 FXMVECTOR RotationQuaternion 8427) 8428{ 8429#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8430 8431 XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v); 8432 XMVECTOR Result = 
XMQuaternionMultiply(RotationQuaternion, A); 8433 XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion); 8434 return XMQuaternionMultiply(Result, Q); 8435 8436#else // _XM_VMX128_INTRINSICS_ 8437#endif // _XM_VMX128_INTRINSICS_ 8438} 8439 8440//------------------------------------------------------------------------------ 8441 8442inline XMVECTOR XMVector3Transform 8443( 8444 FXMVECTOR V, 8445 CXMMATRIX M 8446) 8447{ 8448#if defined(_XM_NO_INTRINSICS_) 8449 8450 XMVECTOR Z = XMVectorSplatZ(V); 8451 XMVECTOR Y = XMVectorSplatY(V); 8452 XMVECTOR X = XMVectorSplatX(V); 8453 8454 XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); 8455 Result = XMVectorMultiplyAdd(Y, M.r[1], Result); 8456 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 8457 8458 return Result; 8459 8460#elif defined(_XM_ARM_NEON_INTRINSICS_) 8461 __n64 VL = vget_low_f32( V ); 8462 XMVECTOR vResult = vdupq_lane_f32( VL, 0 ); // X 8463 XMVECTOR vTemp = vdupq_lane_f32( VL, 1 ); // Y 8464 vResult = vmlaq_f32( M.r[3], vResult, M.r[0] ); 8465 vResult = vmlaq_f32( vResult, vTemp, M.r[1] ); 8466 vTemp = vdupq_lane_f32( vget_high_f32( V ), 0 ); // Z 8467 return vmlaq_f32( vResult, vTemp, M.r[2] ); 8468#elif defined(_XM_SSE_INTRINSICS_) 8469 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); 8470 vResult = _mm_mul_ps(vResult,M.r[0]); 8471 XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 8472 vTemp = _mm_mul_ps(vTemp,M.r[1]); 8473 vResult = _mm_add_ps(vResult,vTemp); 8474 vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); 8475 vTemp = _mm_mul_ps(vTemp,M.r[2]); 8476 vResult = _mm_add_ps(vResult,vTemp); 8477 vResult = _mm_add_ps(vResult,M.r[3]); 8478 return vResult; 8479#else // _XM_VMX128_INTRINSICS_ 8480#endif // _XM_VMX128_INTRINSICS_ 8481} 8482 8483//------------------------------------------------------------------------------ 8484 8485_Use_decl_annotations_ 8486inline XMFLOAT4* XMVector3TransformStream 8487( 8488 XMFLOAT4* pOutputStream, 8489 size_t OutputStride, 8490 const XMFLOAT3* 
pInputStream, 8491 size_t InputStride, 8492 size_t VectorCount, 8493 CXMMATRIX M 8494) 8495{ 8496 assert(pOutputStream != NULL); 8497 assert(pInputStream != NULL); 8498 8499#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8500 8501 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 8502 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 8503 8504 const XMVECTOR row0 = M.r[0]; 8505 const XMVECTOR row1 = M.r[1]; 8506 const XMVECTOR row2 = M.r[2]; 8507 const XMVECTOR row3 = M.r[3]; 8508 8509 for (size_t i = 0; i < VectorCount; i++) 8510 { 8511 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); 8512 XMVECTOR Z = XMVectorSplatZ(V); 8513 XMVECTOR Y = XMVectorSplatY(V); 8514 XMVECTOR X = XMVectorSplatX(V); 8515 8516 XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); 8517 Result = XMVectorMultiplyAdd(Y, row1, Result); 8518 Result = XMVectorMultiplyAdd(X, row0, Result); 8519 8520 XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); 8521 8522 pInputVector += InputStride; 8523 pOutputVector += OutputStride; 8524 } 8525 8526 return pOutputStream; 8527 8528#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 8529#endif // _XM_VMX128_INTRINSICS_ 8530} 8531 8532 8533//------------------------------------------------------------------------------ 8534 8535inline XMVECTOR XMVector3TransformCoord 8536( 8537 FXMVECTOR V, 8538 CXMMATRIX M 8539) 8540{ 8541#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8542 8543 XMVECTOR Z = XMVectorSplatZ(V); 8544 XMVECTOR Y = XMVectorSplatY(V); 8545 XMVECTOR X = XMVectorSplatX(V); 8546 8547 XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]); 8548 Result = XMVectorMultiplyAdd(Y, M.r[1], Result); 8549 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 8550 8551 XMVECTOR W = XMVectorSplatW(Result); 8552 return XMVectorDivide( Result, W ); 8553 8554#else // _XM_VMX128_INTRINSICS_ 8555#endif // _XM_VMX128_INTRINSICS_ 8556} 8557 
8558//------------------------------------------------------------------------------ 8559 8560_Use_decl_annotations_ 8561inline XMFLOAT3* XMVector3TransformCoordStream 8562( 8563 XMFLOAT3* pOutputStream, 8564 size_t OutputStride, 8565 const XMFLOAT3* pInputStream, 8566 size_t InputStride, 8567 size_t VectorCount, 8568 CXMMATRIX M 8569) 8570{ 8571 assert(pOutputStream != NULL); 8572 assert(pInputStream != NULL); 8573 8574#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8575 8576 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 8577 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 8578 8579 const XMVECTOR row0 = M.r[0]; 8580 const XMVECTOR row1 = M.r[1]; 8581 const XMVECTOR row2 = M.r[2]; 8582 const XMVECTOR row3 = M.r[3]; 8583 8584 for (size_t i = 0; i < VectorCount; i++) 8585 { 8586 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); 8587 XMVECTOR Z = XMVectorSplatZ(V); 8588 XMVECTOR Y = XMVectorSplatY(V); 8589 XMVECTOR X = XMVectorSplatX(V); 8590 8591 XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3); 8592 Result = XMVectorMultiplyAdd(Y, row1, Result); 8593 Result = XMVectorMultiplyAdd(X, row0, Result); 8594 8595 XMVECTOR W = XMVectorSplatW(Result); 8596 8597 Result = XMVectorDivide(Result, W); 8598 8599 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); 8600 8601 pInputVector += InputStride; 8602 pOutputVector += OutputStride; 8603 } 8604 8605 return pOutputStream; 8606 8607#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 8608#endif // _XM_VMX128_INTRINSICS_ 8609} 8610 8611//------------------------------------------------------------------------------ 8612 8613inline XMVECTOR XMVector3TransformNormal 8614( 8615 FXMVECTOR V, 8616 CXMMATRIX M 8617) 8618{ 8619#if defined(_XM_NO_INTRINSICS_) 8620 8621 XMVECTOR Z = XMVectorSplatZ(V); 8622 XMVECTOR Y = XMVectorSplatY(V); 8623 XMVECTOR X = XMVectorSplatX(V); 8624 8625 XMVECTOR Result = XMVectorMultiply(Z, M.r[2]); 8626 Result = XMVectorMultiplyAdd(Y, 
M.r[1], Result); 8627 Result = XMVectorMultiplyAdd(X, M.r[0], Result); 8628 8629 return Result; 8630 8631#elif defined(_XM_ARM_NEON_INTRINSICS_) 8632 __n64 VL = vget_low_f32( V ); 8633 XMVECTOR vResult = vdupq_lane_f32( VL, 0 ); // X 8634 XMVECTOR vTemp = vdupq_lane_f32( VL, 1 ); // Y 8635 vResult = vmulq_f32( vResult, M.r[0] ); 8636 vResult = vmlaq_f32( vResult, vTemp, M.r[1] ); 8637 vTemp = vdupq_lane_f32( vget_high_f32( V ), 0 ); // Z 8638 return vmlaq_f32( vResult, vTemp, M.r[2] ); 8639#elif defined(_XM_SSE_INTRINSICS_) 8640 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); 8641 vResult = _mm_mul_ps(vResult,M.r[0]); 8642 XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 8643 vTemp = _mm_mul_ps(vTemp,M.r[1]); 8644 vResult = _mm_add_ps(vResult,vTemp); 8645 vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); 8646 vTemp = _mm_mul_ps(vTemp,M.r[2]); 8647 vResult = _mm_add_ps(vResult,vTemp); 8648 return vResult; 8649#else // _XM_VMX128_INTRINSICS_ 8650#endif // _XM_VMX128_INTRINSICS_ 8651} 8652 8653//------------------------------------------------------------------------------ 8654 8655_Use_decl_annotations_ 8656inline XMFLOAT3* XMVector3TransformNormalStream 8657( 8658 XMFLOAT3* pOutputStream, 8659 size_t OutputStride, 8660 const XMFLOAT3* pInputStream, 8661 size_t InputStride, 8662 size_t VectorCount, 8663 CXMMATRIX M 8664) 8665{ 8666 assert(pOutputStream != NULL); 8667 assert(pInputStream != NULL); 8668 8669#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8670 8671 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 8672 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 8673 8674 const XMVECTOR row0 = M.r[0]; 8675 const XMVECTOR row1 = M.r[1]; 8676 const XMVECTOR row2 = M.r[2]; 8677 8678 for (size_t i = 0; i < VectorCount; i++) 8679 { 8680 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); 8681 XMVECTOR Z = XMVectorSplatZ(V); 8682 XMVECTOR Y = XMVectorSplatY(V); 8683 XMVECTOR X = 
XMVectorSplatX(V); 8684 8685 XMVECTOR Result = XMVectorMultiply(Z, row2); 8686 Result = XMVectorMultiplyAdd(Y, row1, Result); 8687 Result = XMVectorMultiplyAdd(X, row0, Result); 8688 8689 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); 8690 8691 pInputVector += InputStride; 8692 pOutputVector += OutputStride; 8693 } 8694 8695 return pOutputStream; 8696 8697#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 8698#endif // _XM_VMX128_INTRINSICS_ 8699} 8700 8701//------------------------------------------------------------------------------ 8702 8703inline XMVECTOR XMVector3Project 8704( 8705 FXMVECTOR V, 8706 float ViewportX, 8707 float ViewportY, 8708 float ViewportWidth, 8709 float ViewportHeight, 8710 float ViewportMinZ, 8711 float ViewportMaxZ, 8712 CXMMATRIX Projection, 8713 CXMMATRIX View, 8714 CXMMATRIX World 8715) 8716{ 8717#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 8718 8719 const float HalfViewportWidth = ViewportWidth * 0.5f; 8720 const float HalfViewportHeight = ViewportHeight * 0.5f; 8721 8722 XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f); 8723 XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); 8724 8725 XMMATRIX Transform = XMMatrixMultiply(World, View); 8726 Transform = XMMatrixMultiply(Transform, Projection); 8727 8728 XMVECTOR Result = XMVector3TransformCoord(V, Transform); 8729 8730 Result = XMVectorMultiplyAdd(Result, Scale, Offset); 8731 8732 return Result; 8733 8734#else // _XM_VMX128_INTRINSICS_ 8735#endif // _XM_VMX128_INTRINSICS_ 8736} 8737 8738//------------------------------------------------------------------------------ 8739 8740_Use_decl_annotations_ 8741inline XMFLOAT3* XMVector3ProjectStream 8742( 8743 XMFLOAT3* pOutputStream, 8744 size_t OutputStride, 8745 const XMFLOAT3* pInputStream, 8746 size_t InputStride, 8747 size_t VectorCount, 8748 float ViewportX, 8749 
float ViewportY, 8750 float ViewportWidth, 8751 float ViewportHeight, 8752 float ViewportMinZ, 8753 float ViewportMaxZ, 8754 CXMMATRIX Projection, 8755 CXMMATRIX View, 8756 CXMMATRIX World 8757) 8758{ 8759 assert(pOutputStream != NULL); 8760 assert(pInputStream != NULL); 8761 8762#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 8763 8764 const float HalfViewportWidth = ViewportWidth * 0.5f; 8765 const float HalfViewportHeight = ViewportHeight * 0.5f; 8766 8767 XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f); 8768 XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f); 8769 8770 XMMATRIX Transform = XMMatrixMultiply(World, View); 8771 Transform = XMMatrixMultiply(Transform, Projection); 8772 8773 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 8774 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 8775 8776 for (size_t i = 0; i < VectorCount; i++) 8777 { 8778 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); 8779 8780 XMVECTOR Result = XMVector3TransformCoord(V, Transform); 8781 Result = XMVectorMultiplyAdd(Result, Scale, Offset); 8782 8783 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); 8784 8785 pInputVector += InputStride; 8786 pOutputVector += OutputStride; 8787 } 8788 8789 return pOutputStream; 8790 8791#else // _XM_VMX128_INTRINSICS_ 8792#endif // _XM_VMX128_INTRINSICS_ 8793} 8794 8795//------------------------------------------------------------------------------ 8796 8797inline XMVECTOR XMVector3Unproject 8798( 8799 FXMVECTOR V, 8800 float ViewportX, 8801 float ViewportY, 8802 float ViewportWidth, 8803 float ViewportHeight, 8804 float ViewportMinZ, 8805 float ViewportMaxZ, 8806 CXMMATRIX Projection, 8807 CXMMATRIX View, 8808 CXMMATRIX World 8809) 8810{ 8811#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || 
defined(_XM_ARM_NEON_INTRINSICS_) 8812 8813 static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; 8814 8815 XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); 8816 Scale = XMVectorReciprocal(Scale); 8817 8818 XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); 8819 Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); 8820 8821 XMMATRIX Transform = XMMatrixMultiply(World, View); 8822 Transform = XMMatrixMultiply(Transform, Projection); 8823 Transform = XMMatrixInverse(NULL, Transform); 8824 8825 XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); 8826 8827 return XMVector3TransformCoord(Result, Transform); 8828 8829#else // _XM_VMX128_INTRINSICS_ 8830#endif // _XM_VMX128_INTRINSICS_ 8831} 8832 8833//------------------------------------------------------------------------------ 8834 8835_Use_decl_annotations_ 8836inline XMFLOAT3* XMVector3UnprojectStream 8837( 8838 XMFLOAT3* pOutputStream, 8839 size_t OutputStride, 8840 const XMFLOAT3* pInputStream, 8841 size_t InputStride, 8842 size_t VectorCount, 8843 float ViewportX, 8844 float ViewportY, 8845 float ViewportWidth, 8846 float ViewportHeight, 8847 float ViewportMinZ, 8848 float ViewportMaxZ, 8849 CXMMATRIX Projection, 8850 CXMMATRIX View, 8851 CXMMATRIX World) 8852{ 8853 assert(pOutputStream != NULL); 8854 assert(pInputStream != NULL); 8855 8856#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_ARM_NEON_INTRINSICS_) 8857 8858 static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f }; 8859 8860 XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f); 8861 Scale = XMVectorReciprocal(Scale); 8862 8863 XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f); 8864 Offset = XMVectorMultiplyAdd(Scale, Offset, D.v); 8865 8866 XMMATRIX Transform = XMMatrixMultiply(World, View); 8867 Transform = 
XMMatrixMultiply(Transform, Projection); 8868 Transform = XMMatrixInverse(NULL, Transform); 8869 8870 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 8871 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 8872 8873 for (size_t i = 0; i < VectorCount; i++) 8874 { 8875 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector); 8876 8877 XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset); 8878 8879 Result = XMVector3TransformCoord(Result, Transform); 8880 8881 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result); 8882 8883 pInputVector += InputStride; 8884 pOutputVector += OutputStride; 8885 } 8886 8887 return pOutputStream; 8888 8889#else // _XM_VMX128_INTRINSICS_ 8890#endif // _XM_VMX128_INTRINSICS_ 8891} 8892 8893/**************************************************************************** 8894 * 8895 * 4D Vector 8896 * 8897 ****************************************************************************/ 8898 8899//------------------------------------------------------------------------------ 8900// Comparison operations 8901//------------------------------------------------------------------------------ 8902 8903//------------------------------------------------------------------------------ 8904 8905inline bool XMVector4Equal 8906( 8907 FXMVECTOR V1, 8908 FXMVECTOR V2 8909) 8910{ 8911#if defined(_XM_NO_INTRINSICS_) 8912 return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0); 8913#elif defined(_XM_ARM_NEON_INTRINSICS_) 8914 __n128 vResult = vceqq_f32( V1, V2 ); 8915 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 8916 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 8917 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 8918#elif defined(_XM_SSE_INTRINSICS_) 8919 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 8920 return ((_mm_movemask_ps(vTemp)==0x0f) != 0); 8921#else 8922 return 
XMComparisonAllTrue(XMVector4EqualR(V1, V2)); 8923#endif 8924} 8925 8926//------------------------------------------------------------------------------ 8927 8928inline uint32_t XMVector4EqualR 8929( 8930 FXMVECTOR V1, 8931 FXMVECTOR V2 8932) 8933{ 8934#if defined(_XM_NO_INTRINSICS_) 8935 8936 uint32_t CR = 0; 8937 8938 if ((V1.vector4_f32[0] == V2.vector4_f32[0]) && 8939 (V1.vector4_f32[1] == V2.vector4_f32[1]) && 8940 (V1.vector4_f32[2] == V2.vector4_f32[2]) && 8941 (V1.vector4_f32[3] == V2.vector4_f32[3])) 8942 { 8943 CR = XM_CRMASK_CR6TRUE; 8944 } 8945 else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) && 8946 (V1.vector4_f32[1] != V2.vector4_f32[1]) && 8947 (V1.vector4_f32[2] != V2.vector4_f32[2]) && 8948 (V1.vector4_f32[3] != V2.vector4_f32[3])) 8949 { 8950 CR = XM_CRMASK_CR6FALSE; 8951 } 8952 return CR; 8953 8954#elif defined(_XM_ARM_NEON_INTRINSICS_) 8955 __n128 vResult = vceqq_f32( V1, V2 ); 8956 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 8957 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 8958 uint32_t r = vget_lane_u32(vTemp.val[1], 1); 8959 8960 uint32_t CR = 0; 8961 if ( r == 0xFFFFFFFFU ) 8962 { 8963 CR = XM_CRMASK_CR6TRUE; 8964 } 8965 else if ( !r ) 8966 { 8967 CR = XM_CRMASK_CR6FALSE; 8968 } 8969 return CR; 8970#elif defined(_XM_SSE_INTRINSICS_) 8971 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2); 8972 int iTest = _mm_movemask_ps(vTemp); 8973 uint32_t CR = 0; 8974 if (iTest==0xf) // All equal? 8975 { 8976 CR = XM_CRMASK_CR6TRUE; 8977 } 8978 else if (iTest==0) // All not equal? 
8979 { 8980 CR = XM_CRMASK_CR6FALSE; 8981 } 8982 return CR; 8983#else // _XM_VMX128_INTRINSICS_ 8984#endif // _XM_VMX128_INTRINSICS_ 8985} 8986 8987//------------------------------------------------------------------------------ 8988 8989inline bool XMVector4EqualInt 8990( 8991 FXMVECTOR V1, 8992 FXMVECTOR V2 8993) 8994{ 8995#if defined(_XM_NO_INTRINSICS_) 8996 return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0); 8997#elif defined(_XM_ARM_NEON_INTRINSICS_) 8998 __n128 vResult = vceqq_u32( V1, V2 ); 8999 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9000 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9001 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9002#elif defined(_XM_SSE_INTRINSICS_) 9003 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 9004 return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))==0xf) != 0); 9005#else 9006 return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2)); 9007#endif 9008} 9009 9010//------------------------------------------------------------------------------ 9011 9012inline uint32_t XMVector4EqualIntR 9013( 9014 FXMVECTOR V1, 9015 FXMVECTOR V2 9016) 9017{ 9018#if defined(_XM_NO_INTRINSICS_) 9019 uint32_t CR = 0; 9020 if (V1.vector4_u32[0] == V2.vector4_u32[0] && 9021 V1.vector4_u32[1] == V2.vector4_u32[1] && 9022 V1.vector4_u32[2] == V2.vector4_u32[2] && 9023 V1.vector4_u32[3] == V2.vector4_u32[3]) 9024 { 9025 CR = XM_CRMASK_CR6TRUE; 9026 } 9027 else if (V1.vector4_u32[0] != V2.vector4_u32[0] && 9028 V1.vector4_u32[1] != V2.vector4_u32[1] && 9029 V1.vector4_u32[2] != V2.vector4_u32[2] && 9030 V1.vector4_u32[3] != V2.vector4_u32[3]) 9031 { 9032 CR = XM_CRMASK_CR6FALSE; 9033 } 9034 return CR; 9035 9036#elif defined(_XM_ARM_NEON_INTRINSICS_) 9037 __n128 vResult = vceqq_u32( V1, V2 ); 9038 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), 
vget_high_u8(vResult)); 9039 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9040 uint32_t r = vget_lane_u32(vTemp.val[1], 1); 9041 9042 uint32_t CR = 0; 9043 if ( r == 0xFFFFFFFFU ) 9044 { 9045 CR = XM_CRMASK_CR6TRUE; 9046 } 9047 else if ( !r ) 9048 { 9049 CR = XM_CRMASK_CR6FALSE; 9050 } 9051 return CR; 9052#elif defined(_XM_SSE_INTRINSICS_) 9053 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 9054 int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp)); 9055 uint32_t CR = 0; 9056 if (iTest==0xf) // All equal? 9057 { 9058 CR = XM_CRMASK_CR6TRUE; 9059 } 9060 else if (iTest==0) // All not equal? 9061 { 9062 CR = XM_CRMASK_CR6FALSE; 9063 } 9064 return CR; 9065#else // _XM_VMX128_INTRINSICS_ 9066#endif // _XM_VMX128_INTRINSICS_ 9067} 9068 9069inline bool XMVector4NearEqual 9070( 9071 FXMVECTOR V1, 9072 FXMVECTOR V2, 9073 FXMVECTOR Epsilon 9074) 9075{ 9076#if defined(_XM_NO_INTRINSICS_) 9077 float dx, dy, dz, dw; 9078 9079 dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]); 9080 dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]); 9081 dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]); 9082 dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]); 9083 return (((dx <= Epsilon.vector4_f32[0]) && 9084 (dy <= Epsilon.vector4_f32[1]) && 9085 (dz <= Epsilon.vector4_f32[2]) && 9086 (dw <= Epsilon.vector4_f32[3])) != 0); 9087#elif defined(_XM_ARM_NEON_INTRINSICS_) 9088 __n128 vDelta = vsubq_f32( V1, V2 ); 9089 __n128 vResult = vacleq_f32( vDelta, Epsilon ); 9090 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9091 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9092 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9093#elif defined(_XM_SSE_INTRINSICS_) 9094 // Get the difference 9095 XMVECTOR vDelta = _mm_sub_ps(V1,V2); 9096 // Get the absolute value of the difference 9097 XMVECTOR vTemp = _mm_setzero_ps(); 9098 vTemp = _mm_sub_ps(vTemp,vDelta); 9099 vTemp = _mm_max_ps(vTemp,vDelta); 9100 vTemp = _mm_cmple_ps(vTemp,Epsilon); 9101 return 
((_mm_movemask_ps(vTemp)==0xf) != 0); 9102#else // _XM_VMX128_INTRINSICS_ 9103#endif // _XM_VMX128_INTRINSICS_ 9104} 9105 9106//------------------------------------------------------------------------------ 9107 9108inline bool XMVector4NotEqual 9109( 9110 FXMVECTOR V1, 9111 FXMVECTOR V2 9112) 9113{ 9114#if defined(_XM_NO_INTRINSICS_) 9115 return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0); 9116#elif defined(_XM_ARM_NEON_INTRINSICS_) 9117 __n128 vResult = vceqq_f32( V1, V2 ); 9118 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9119 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9120 return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); 9121#elif defined(_XM_SSE_INTRINSICS_) 9122 XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2); 9123 return ((_mm_movemask_ps(vTemp)) != 0); 9124#else 9125 return XMComparisonAnyFalse(XMVector4EqualR(V1, V2)); 9126#endif 9127} 9128 9129//------------------------------------------------------------------------------ 9130 9131inline bool XMVector4NotEqualInt 9132( 9133 FXMVECTOR V1, 9134 FXMVECTOR V2 9135) 9136{ 9137#if defined(_XM_NO_INTRINSICS_) 9138 return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0); 9139#elif defined(_XM_ARM_NEON_INTRINSICS_) 9140 __n128 vResult = vceqq_u32( V1, V2 ); 9141 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9142 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9143 return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); 9144#elif defined(_XM_SSE_INTRINSICS_) 9145 __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2)); 9146 return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))!=0xF) != 0); 9147#else 9148 return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2)); 9149#endif 9150} 
9151 9152//------------------------------------------------------------------------------ 9153 9154inline bool XMVector4Greater 9155( 9156 FXMVECTOR V1, 9157 FXMVECTOR V2 9158) 9159{ 9160#if defined(_XM_NO_INTRINSICS_) 9161 return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0); 9162#elif defined(_XM_ARM_NEON_INTRINSICS_) 9163 __n128 vResult = vcgtq_f32( V1, V2 ); 9164 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9165 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9166 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9167#elif defined(_XM_SSE_INTRINSICS_) 9168 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 9169 return ((_mm_movemask_ps(vTemp)==0x0f) != 0); 9170#else 9171 return XMComparisonAllTrue(XMVector4GreaterR(V1, V2)); 9172#endif 9173} 9174 9175//------------------------------------------------------------------------------ 9176 9177inline uint32_t XMVector4GreaterR 9178( 9179 FXMVECTOR V1, 9180 FXMVECTOR V2 9181) 9182{ 9183#if defined(_XM_NO_INTRINSICS_) 9184 uint32_t CR = 0; 9185 if (V1.vector4_f32[0] > V2.vector4_f32[0] && 9186 V1.vector4_f32[1] > V2.vector4_f32[1] && 9187 V1.vector4_f32[2] > V2.vector4_f32[2] && 9188 V1.vector4_f32[3] > V2.vector4_f32[3]) 9189 { 9190 CR = XM_CRMASK_CR6TRUE; 9191 } 9192 else if (V1.vector4_f32[0] <= V2.vector4_f32[0] && 9193 V1.vector4_f32[1] <= V2.vector4_f32[1] && 9194 V1.vector4_f32[2] <= V2.vector4_f32[2] && 9195 V1.vector4_f32[3] <= V2.vector4_f32[3]) 9196 { 9197 CR = XM_CRMASK_CR6FALSE; 9198 } 9199 return CR; 9200 9201#elif defined(_XM_ARM_NEON_INTRINSICS_) 9202 __n128 vResult = vcgtq_f32( V1, V2 ); 9203 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9204 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9205 uint32_t r = vget_lane_u32(vTemp.val[1], 1); 9206 9207 uint32_t CR = 0; 9208 if ( r == 0xFFFFFFFFU ) 9209 { 9210 CR = 
XM_CRMASK_CR6TRUE; 9211 } 9212 else if ( !r ) 9213 { 9214 CR = XM_CRMASK_CR6FALSE; 9215 } 9216 return CR; 9217#elif defined(_XM_SSE_INTRINSICS_) 9218 uint32_t CR = 0; 9219 XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2); 9220 int iTest = _mm_movemask_ps(vTemp); 9221 if (iTest==0xf) { 9222 CR = XM_CRMASK_CR6TRUE; 9223 } 9224 else if (!iTest) 9225 { 9226 CR = XM_CRMASK_CR6FALSE; 9227 } 9228 return CR; 9229#else // _XM_VMX128_INTRINSICS_ 9230#endif // _XM_VMX128_INTRINSICS_ 9231} 9232 9233//------------------------------------------------------------------------------ 9234 9235inline bool XMVector4GreaterOrEqual 9236( 9237 FXMVECTOR V1, 9238 FXMVECTOR V2 9239) 9240{ 9241#if defined(_XM_NO_INTRINSICS_) 9242 return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0); 9243#elif defined(_XM_ARM_NEON_INTRINSICS_) 9244 __n128 vResult = vcgeq_f32( V1, V2 ); 9245 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9246 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9247 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9248#elif defined(_XM_SSE_INTRINSICS_) 9249 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 9250 return ((_mm_movemask_ps(vTemp)==0x0f) != 0); 9251#else 9252 return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2)); 9253#endif 9254} 9255 9256//------------------------------------------------------------------------------ 9257 9258inline uint32_t XMVector4GreaterOrEqualR 9259( 9260 FXMVECTOR V1, 9261 FXMVECTOR V2 9262) 9263{ 9264#if defined(_XM_NO_INTRINSICS_) 9265 uint32_t CR = 0; 9266 if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) && 9267 (V1.vector4_f32[1] >= V2.vector4_f32[1]) && 9268 (V1.vector4_f32[2] >= V2.vector4_f32[2]) && 9269 (V1.vector4_f32[3] >= V2.vector4_f32[3])) 9270 { 9271 CR = XM_CRMASK_CR6TRUE; 9272 } 9273 else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) && 9274 (V1.vector4_f32[1] < V2.vector4_f32[1]) && 
9275 (V1.vector4_f32[2] < V2.vector4_f32[2]) && 9276 (V1.vector4_f32[3] < V2.vector4_f32[3])) 9277 { 9278 CR = XM_CRMASK_CR6FALSE; 9279 } 9280 return CR; 9281 9282#elif defined(_XM_ARM_NEON_INTRINSICS_) 9283 __n128 vResult = vcgeq_f32( V1, V2 ); 9284 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9285 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9286 uint32_t r = vget_lane_u32(vTemp.val[1], 1); 9287 9288 uint32_t CR = 0; 9289 if ( r == 0xFFFFFFFFU ) 9290 { 9291 CR = XM_CRMASK_CR6TRUE; 9292 } 9293 else if ( !r ) 9294 { 9295 CR = XM_CRMASK_CR6FALSE; 9296 } 9297 return CR; 9298#elif defined(_XM_SSE_INTRINSICS_) 9299 uint32_t CR = 0; 9300 XMVECTOR vTemp = _mm_cmpge_ps(V1,V2); 9301 int iTest = _mm_movemask_ps(vTemp); 9302 if (iTest==0x0f) 9303 { 9304 CR = XM_CRMASK_CR6TRUE; 9305 } 9306 else if (!iTest) 9307 { 9308 CR = XM_CRMASK_CR6FALSE; 9309 } 9310 return CR; 9311#else // _XM_VMX128_INTRINSICS_ 9312#endif // _XM_VMX128_INTRINSICS_ 9313} 9314 9315//------------------------------------------------------------------------------ 9316 9317inline bool XMVector4Less 9318( 9319 FXMVECTOR V1, 9320 FXMVECTOR V2 9321) 9322{ 9323#if defined(_XM_NO_INTRINSICS_) 9324 return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0); 9325#elif defined(_XM_ARM_NEON_INTRINSICS_) 9326 __n128 vResult = vcltq_f32( V1, V2 ); 9327 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9328 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9329 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9330#elif defined(_XM_SSE_INTRINSICS_) 9331 XMVECTOR vTemp = _mm_cmplt_ps(V1,V2); 9332 return ((_mm_movemask_ps(vTemp)==0x0f) != 0); 9333#else 9334 return XMComparisonAllTrue(XMVector4GreaterR(V2, V1)); 9335#endif 9336} 9337 9338//------------------------------------------------------------------------------ 9339 9340inline bool 
XMVector4LessOrEqual 9341( 9342 FXMVECTOR V1, 9343 FXMVECTOR V2 9344) 9345{ 9346#if defined(_XM_NO_INTRINSICS_) 9347 return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0); 9348#elif defined(_XM_ARM_NEON_INTRINSICS_) 9349 __n128 vResult = vcleq_f32( V1, V2 ); 9350 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 9351 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9352 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9353#elif defined(_XM_SSE_INTRINSICS_) 9354 XMVECTOR vTemp = _mm_cmple_ps(V1,V2); 9355 return ((_mm_movemask_ps(vTemp)==0x0f) != 0); 9356#else 9357 return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1)); 9358#endif 9359} 9360 9361//------------------------------------------------------------------------------ 9362 9363inline bool XMVector4InBounds 9364( 9365 FXMVECTOR V, 9366 FXMVECTOR Bounds 9367) 9368{ 9369#if defined(_XM_NO_INTRINSICS_) 9370 return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) && 9371 (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) && 9372 (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) && 9373 (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0); 9374#elif defined(_XM_ARM_NEON_INTRINSICS_) 9375 // Test if less than or equal 9376 __n128 vTemp1 = vcleq_f32(V,Bounds); 9377 // Negate the bounds 9378 __n128 vTemp2 = vnegq_f32(Bounds); 9379 // Test if greater or equal (Reversed) 9380 vTemp2 = vcleq_f32(vTemp2,V); 9381 // Blend answers 9382 vTemp1 = vandq_u32(vTemp1,vTemp2); 9383 // in bounds? 
9384 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1)); 9385 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9386 return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ); 9387#elif defined(_XM_SSE_INTRINSICS_) 9388 // Test if less than or equal 9389 XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds); 9390 // Negate the bounds 9391 XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne); 9392 // Test if greater or equal (Reversed) 9393 vTemp2 = _mm_cmple_ps(vTemp2,V); 9394 // Blend answers 9395 vTemp1 = _mm_and_ps(vTemp1,vTemp2); 9396 // All in bounds? 9397 return ((_mm_movemask_ps(vTemp1)==0x0f) != 0); 9398#else 9399 return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds)); 9400#endif 9401} 9402 9403 9404//------------------------------------------------------------------------------ 9405 9406inline bool XMVector4IsNaN 9407( 9408 FXMVECTOR V 9409) 9410{ 9411#if defined(_XM_NO_INTRINSICS_) 9412 return (XMISNAN(V.vector4_f32[0]) || 9413 XMISNAN(V.vector4_f32[1]) || 9414 XMISNAN(V.vector4_f32[2]) || 9415 XMISNAN(V.vector4_f32[3])); 9416#elif defined(_XM_ARM_NEON_INTRINSICS_) 9417 // Test against itself. NaN is always not equal 9418 __n128 vTempNan = vceqq_f32( V, V ); 9419 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan)); 9420 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9421 // If any are NaN, the mask is zero 9422 return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU ); 9423#elif defined(_XM_SSE_INTRINSICS_) 9424 // Test against itself. 
NaN is always not equal 9425 XMVECTOR vTempNan = _mm_cmpneq_ps(V,V); 9426 // If any are NaN, the mask is non-zero 9427 return (_mm_movemask_ps(vTempNan)!=0); 9428#else // _XM_VMX128_INTRINSICS_ 9429#endif // _XM_VMX128_INTRINSICS_ 9430} 9431 9432//------------------------------------------------------------------------------ 9433 9434inline bool XMVector4IsInfinite 9435( 9436 FXMVECTOR V 9437) 9438{ 9439#if defined(_XM_NO_INTRINSICS_) 9440 9441 return (XMISINF(V.vector4_f32[0]) || 9442 XMISINF(V.vector4_f32[1]) || 9443 XMISINF(V.vector4_f32[2]) || 9444 XMISINF(V.vector4_f32[3])); 9445 9446#elif defined(_XM_ARM_NEON_INTRINSICS_) 9447 // Mask off the sign bit 9448 __n128 vTempInf = vandq_u32( V, g_XMAbsMask ); 9449 // Compare to infinity 9450 vTempInf = vceqq_f32(vTempInf, g_XMInfinity ); 9451 // If any are infinity, the signs are true. 9452 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf)); 9453 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 9454 return ( vget_lane_u32(vTemp.val[1], 1) != 0 ); 9455#elif defined(_XM_SSE_INTRINSICS_) 9456 // Mask off the sign bit 9457 XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask); 9458 // Compare to infinity 9459 vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity); 9460 // If any are infinity, the signs are true. 
9461 return (_mm_movemask_ps(vTemp) != 0); 9462#else // _XM_VMX128_INTRINSICS_ 9463#endif // _XM_VMX128_INTRINSICS_ 9464} 9465 9466//------------------------------------------------------------------------------ 9467// Computation operations 9468//------------------------------------------------------------------------------ 9469 9470//------------------------------------------------------------------------------ 9471 9472inline XMVECTOR XMVector4Dot 9473( 9474 FXMVECTOR V1, 9475 FXMVECTOR V2 9476) 9477{ 9478#if defined(_XM_NO_INTRINSICS_) 9479 9480 XMVECTOR Result; 9481 Result.vector4_f32[0] = 9482 Result.vector4_f32[1] = 9483 Result.vector4_f32[2] = 9484 Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3]; 9485 return Result; 9486 9487#elif defined(_XM_ARM_NEON_INTRINSICS_) 9488 __n128 vTemp = vmulq_f32( V1, V2 ); 9489 __n64 v1 = vget_low_f32( vTemp ); 9490 __n64 v2 = vget_high_f32( vTemp ); 9491 v1 = vpadd_f32( v1, v1 ); 9492 v2 = vpadd_f32( v2, v2 ); 9493 v1 = vadd_f32( v1, v2 ); 9494 return vcombine_f32( v1, v1 ); 9495#elif defined(_XM_SSE_INTRINSICS_) 9496 XMVECTOR vTemp2 = V2; 9497 XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2); 9498 vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position 9499 vTemp2 = _mm_add_ps(vTemp2,vTemp); // Add Z = X+Z; W = Y+W; 9500 vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0)); // Copy W to the Z position 9501 vTemp = _mm_add_ps(vTemp,vTemp2); // Add Z and W together 9502 return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(2,2,2,2)); // Splat Z and return 9503#else // _XM_VMX128_INTRINSICS_ 9504#endif // _XM_VMX128_INTRINSICS_ 9505} 9506 9507//------------------------------------------------------------------------------ 9508 9509inline XMVECTOR XMVector4Cross 9510( 9511 FXMVECTOR V1, 9512 FXMVECTOR V2, 9513 FXMVECTOR V3 9514) 9515{ 9516 // [ 
((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w), 9517 // ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w), 9518 // ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w), 9519 // ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ] 9520 9521#if defined(_XM_NO_INTRINSICS_) 9522 XMVECTOR Result; 9523 9524 Result.vector4_f32[0] = (((V2.vector4_f32[2]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[2]))*V1.vector4_f32[1])-(((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[2])+(((V2.vector4_f32[1]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[1]))*V1.vector4_f32[3]); 9525 Result.vector4_f32[1] = (((V2.vector4_f32[3]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[3]))*V1.vector4_f32[0])-(((V2.vector4_f32[3]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[3]))*V1.vector4_f32[2])+(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[3]); 9526 Result.vector4_f32[2] = (((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[0])-(((V2.vector4_f32[0]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[0]))*V1.vector4_f32[1])+(((V2.vector4_f32[0]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[0]))*V1.vector4_f32[3]); 9527 Result.vector4_f32[3] = (((V2.vector4_f32[2]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[2]))*V1.vector4_f32[0])-(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[1])+(((V2.vector4_f32[1]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[1]))*V1.vector4_f32[2]); 9528 return Result; 9529 9530#elif defined(_XM_ARM_NEON_INTRINSICS_) 9531 const __n64 select = vget_low_f32( g_XMMaskX ); 9532 9533 // Term1: V2zwyz * V3wzwy 9534 const __n64 v2xy = vget_low_f32(V2); 9535 const __n64 v2zw = vget_high_f32(V2); 9536 const 
__n64 v2yx = vrev64_f32(v2xy); 9537 const __n64 v2wz = vrev64_f32(v2zw); 9538 const __n64 v2yz = vbsl_f32( select, v2yx, v2wz ); 9539 9540 const __n64 v3zw = vget_high_f32(V3); 9541 const __n64 v3wz = vrev64_f32(v3zw); 9542 const __n64 v3xy = vget_low_f32(V3); 9543 const __n64 v3wy = vbsl_f32( select, v3wz, v3xy ); 9544 9545 __n128 vTemp1 = vcombine_f32(v2zw,v2yz); 9546 __n128 vTemp2 = vcombine_f32(v3wz,v3wy); 9547 __n128 vResult = vmulq_f32( vTemp1, vTemp2 ); 9548 9549 // - V2wzwy * V3zwyz 9550 const __n64 v2wy = vbsl_f32( select, v2wz, v2xy ); 9551 9552 const __n64 v3yx = vrev64_f32(v3xy); 9553 const __n64 v3yz = vbsl_f32( select, v3yx, v3wz ); 9554 9555 vTemp1 = vcombine_f32(v2wz,v2wy); 9556 vTemp2 = vcombine_f32(v3zw,v3yz); 9557 vResult = vmlsq_f32( vResult, vTemp1, vTemp2 ); 9558 9559 // term1 * V1yxxx 9560 const __n64 v1xy = vget_low_f32(V1); 9561 const __n64 v1yx = vrev64_f32(v1xy); 9562 9563 vTemp1 = vcombine_f32( v1yx, vdup_lane_f32( v1yx, 1 ) ); 9564 vResult = vmulq_f32( vResult, vTemp1 ); 9565 9566 // Term2: V2ywxz * V3wxwx 9567 const __n64 v2yw = vrev64_f32(v2wy); 9568 const __n64 v2xz = vbsl_f32( select, v2xy, v2wz ); 9569 9570 const __n64 v3wx = vbsl_f32( select, v3wz, v3yx ); 9571 9572 vTemp1 = vcombine_f32(v2yw,v2xz); 9573 vTemp2 = vcombine_f32(v3wx,v3wx); 9574 __n128 vTerm = vmulq_f32( vTemp1, vTemp2 ); 9575 9576 // - V2wxwx * V3ywxz 9577 const __n64 v2wx = vbsl_f32( select, v2wz, v2yx ); 9578 9579 const __n64 v3yw = vrev64_f32(v3wy); 9580 const __n64 v3xz = vbsl_f32( select, v3xy, v3wz ); 9581 9582 vTemp1 = vcombine_f32(v2wx,v2wx); 9583 vTemp2 = vcombine_f32(v3yw,v3xz); 9584 vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); 9585 9586 // vResult - term2 * V1zzyy 9587 const __n64 v1zw = vget_high_f32(V1); 9588 9589 vTemp1 = vcombine_f32( vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0) ); 9590 vResult = vmlsq_f32( vResult, vTerm, vTemp1 ); 9591 9592 // Term3: V2yzxy * V3zxyx 9593 const __n64 v3zx = vrev64_f32(v3xz); 9594 9595 vTemp1 = 
vcombine_f32(v2yz,v2xy); 9596 vTemp2 = vcombine_f32(v3zx,v3yx); 9597 vTerm = vmulq_f32( vTemp1, vTemp2 ); 9598 9599 // - V2zxyx * V3yzxy 9600 const __n64 v2zx = vrev64_f32(v2xz); 9601 9602 vTemp1 = vcombine_f32(v2zx,v2yx); 9603 vTemp2 = vcombine_f32(v3yz,v3xy); 9604 vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 ); 9605 9606 // vResult + term3 * V1wwwz 9607 const __n64 v1wz = vrev64_f32(v1zw); 9608 9609 vTemp1 = vcombine_f32( vdup_lane_f32( v1wz, 0 ), v1wz ); 9610 return vmlaq_f32( vResult, vTerm, vTemp1 ); 9611#elif defined(_XM_SSE_INTRINSICS_) 9612 // V2zwyz * V3wzwy 9613 XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,1,3,2)); 9614 XMVECTOR vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,3,2,3)); 9615 vResult = _mm_mul_ps(vResult,vTemp3); 9616 // - V2wzwy * V3zwyz 9617 XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,3,2,3)); 9618 vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(1,3,0,1)); 9619 vTemp2 = _mm_mul_ps(vTemp2,vTemp3); 9620 vResult = _mm_sub_ps(vResult,vTemp2); 9621 // term1 * V1yxxx 9622 XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,0,0,1)); 9623 vResult = _mm_mul_ps(vResult,vTemp1); 9624 9625 // V2ywxz * V3wxwx 9626 vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,3,1)); 9627 vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,3,0,3)); 9628 vTemp3 = _mm_mul_ps(vTemp3,vTemp2); 9629 // - V2wxwx * V3ywxz 9630 vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,1,2,1)); 9631 vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(2,0,3,1)); 9632 vTemp2 = _mm_mul_ps(vTemp2,vTemp1); 9633 vTemp3 = _mm_sub_ps(vTemp3,vTemp2); 9634 // vResult - temp * V1zzyy 9635 vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(1,1,2,2)); 9636 vTemp1 = _mm_mul_ps(vTemp1,vTemp3); 9637 vResult = _mm_sub_ps(vResult,vTemp1); 9638 9639 // V2yzxy * V3zxyx 9640 vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,0,2,1)); 9641 vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,1,0,2)); 9642 vTemp3 = _mm_mul_ps(vTemp3,vTemp2); 9643 // - V2zxyx * V3yzxy 9644 vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,0,2,1)); 9645 vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,0,2,1)); 9646 vTemp1 = 
_mm_mul_ps(vTemp1,vTemp2); 9647 vTemp3 = _mm_sub_ps(vTemp3,vTemp1); 9648 // vResult + term * V1wwwz 9649 vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,3,3,3)); 9650 vTemp3 = _mm_mul_ps(vTemp3,vTemp1); 9651 vResult = _mm_add_ps(vResult,vTemp3); 9652 return vResult; 9653#else // _XM_VMX128_INTRINSICS_ 9654#endif // _XM_VMX128_INTRINSICS_ 9655} 9656 9657//------------------------------------------------------------------------------ 9658 9659inline XMVECTOR XMVector4LengthSq 9660( 9661 FXMVECTOR V 9662) 9663{ 9664 return XMVector4Dot(V, V); 9665} 9666 9667//------------------------------------------------------------------------------ 9668 9669inline XMVECTOR XMVector4ReciprocalLengthEst 9670( 9671 FXMVECTOR V 9672) 9673{ 9674#if defined(_XM_NO_INTRINSICS_) 9675 9676 XMVECTOR Result; 9677 9678 Result = XMVector4LengthSq(V); 9679 Result = XMVectorReciprocalSqrtEst(Result); 9680 9681 return Result; 9682 9683#elif defined(_XM_ARM_NEON_INTRINSICS_) 9684 // Dot4 9685 __n128 vTemp = vmulq_f32( V, V ); 9686 __n64 v1 = vget_low_f32( vTemp ); 9687 __n64 v2 = vget_high_f32( vTemp ); 9688 v1 = vpadd_f32( v1, v1 ); 9689 v2 = vpadd_f32( v2, v2 ); 9690 v1 = vadd_f32( v1, v2 ); 9691 // Reciprocal sqrt (estimate) 9692 v2 = vrsqrte_f32( v1 ); 9693 return vcombine_f32(v2, v2); 9694#elif defined(_XM_SSE_INTRINSICS_) 9695 // Perform the dot product on x,y,z and w 9696 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9697 // vTemp has z and w 9698 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9699 // x+z, y+w 9700 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9701 // x+z,x+z,x+z,y+w 9702 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9703 // ??,??,y+w,y+w 9704 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9705 // ??,??,x+z+y+w,?? 
9706 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9707 // Splat the length 9708 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9709 // Get the reciprocal 9710 vLengthSq = _mm_rsqrt_ps(vLengthSq); 9711 return vLengthSq; 9712#else // _XM_VMX128_INTRINSICS_ 9713#endif // _XM_VMX128_INTRINSICS_ 9714} 9715 9716//------------------------------------------------------------------------------ 9717 9718inline XMVECTOR XMVector4ReciprocalLength 9719( 9720 FXMVECTOR V 9721) 9722{ 9723#if defined(_XM_NO_INTRINSICS_) 9724 9725 XMVECTOR Result; 9726 9727 Result = XMVector4LengthSq(V); 9728 Result = XMVectorReciprocalSqrt(Result); 9729 9730 return Result; 9731 9732#elif defined(_XM_ARM_NEON_INTRINSICS_) 9733 // Dot4 9734 __n128 vTemp = vmulq_f32( V, V ); 9735 __n64 v1 = vget_low_f32( vTemp ); 9736 __n64 v2 = vget_high_f32( vTemp ); 9737 v1 = vpadd_f32( v1, v1 ); 9738 v2 = vpadd_f32( v2, v2 ); 9739 v1 = vadd_f32( v1, v2 ); 9740 // Reciprocal sqrt 9741 __n64 S0 = vrsqrte_f32(v1); 9742 __n64 P0 = vmul_f32( v1, S0 ); 9743 __n64 R0 = vrsqrts_f32( P0, S0 ); 9744 __n64 S1 = vmul_f32( S0, R0 ); 9745 __n64 P1 = vmul_f32( v1, S1 ); 9746 __n64 R1 = vrsqrts_f32( P1, S1 ); 9747 __n64 Result = vmul_f32( S1, R1 ); 9748 return vcombine_f32( Result, Result ); 9749#elif defined(_XM_SSE_INTRINSICS_) 9750 // Perform the dot product on x,y,z and w 9751 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9752 // vTemp has z and w 9753 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9754 // x+z, y+w 9755 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9756 // x+z,x+z,x+z,y+w 9757 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9758 // ??,??,y+w,y+w 9759 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9760 // ??,??,x+z+y+w,?? 9761 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9762 // Splat the length 9763 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9764 // Get the reciprocal 9765 vLengthSq = _mm_sqrt_ps(vLengthSq); 9766 // Accurate! 
9767 vLengthSq = _mm_div_ps(g_XMOne,vLengthSq); 9768 return vLengthSq; 9769#else // _XM_VMX128_INTRINSICS_ 9770#endif // _XM_VMX128_INTRINSICS_ 9771} 9772 9773//------------------------------------------------------------------------------ 9774 9775inline XMVECTOR XMVector4LengthEst 9776( 9777 FXMVECTOR V 9778) 9779{ 9780#if defined(_XM_NO_INTRINSICS_) 9781 9782 XMVECTOR Result; 9783 9784 Result = XMVector4LengthSq(V); 9785 Result = XMVectorSqrtEst(Result); 9786 9787 return Result; 9788 9789#elif defined(_XM_ARM_NEON_INTRINSICS_) 9790 // Dot4 9791 __n128 vTemp = vmulq_f32( V, V ); 9792 __n64 v1 = vget_low_f32( vTemp ); 9793 __n64 v2 = vget_high_f32( vTemp ); 9794 v1 = vpadd_f32( v1, v1 ); 9795 v2 = vpadd_f32( v2, v2 ); 9796 v1 = vadd_f32( v1, v2 ); 9797 const __n64 zero = vdup_n_u32(0); 9798 __n64 VEqualsZero = vceq_f32( v1, zero ); 9799 // Sqrt (estimate) 9800 __n64 Result = vrsqrte_f32( v1 ); 9801 Result = vmul_f32( v1, Result ); 9802 Result = vbsl_f32( VEqualsZero, zero, Result ); 9803 return vcombine_f32( Result, Result ); 9804#elif defined(_XM_SSE_INTRINSICS_) 9805 // Perform the dot product on x,y,z and w 9806 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9807 // vTemp has z and w 9808 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9809 // x+z, y+w 9810 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9811 // x+z,x+z,x+z,y+w 9812 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9813 // ??,??,y+w,y+w 9814 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9815 // ??,??,x+z+y+w,?? 
9816 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9817 // Splat the length 9818 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9819 // Prepare for the division 9820 vLengthSq = _mm_sqrt_ps(vLengthSq); 9821 return vLengthSq; 9822#else // _XM_VMX128_INTRINSICS_ 9823#endif // _XM_VMX128_INTRINSICS_ 9824} 9825 9826//------------------------------------------------------------------------------ 9827 9828inline XMVECTOR XMVector4Length 9829( 9830 FXMVECTOR V 9831) 9832{ 9833#if defined(_XM_NO_INTRINSICS_) 9834 9835 XMVECTOR Result; 9836 9837 Result = XMVector4LengthSq(V); 9838 Result = XMVectorSqrt(Result); 9839 9840 return Result; 9841 9842#elif defined(_XM_ARM_NEON_INTRINSICS_) 9843 // Dot4 9844 __n128 vTemp = vmulq_f32( V, V ); 9845 __n64 v1 = vget_low_f32( vTemp ); 9846 __n64 v2 = vget_high_f32( vTemp ); 9847 v1 = vpadd_f32( v1, v1 ); 9848 v2 = vpadd_f32( v2, v2 ); 9849 v1 = vadd_f32( v1, v2 ); 9850 const __n64 zero = vdup_n_u32(0); 9851 __n64 VEqualsZero = vceq_f32( v1, zero ); 9852 // Sqrt 9853 __n64 S0 = vrsqrte_f32( v1 ); 9854 __n64 P0 = vmul_f32( v1, S0 ); 9855 __n64 R0 = vrsqrts_f32( P0, S0 ); 9856 __n64 S1 = vmul_f32( S0, R0 ); 9857 __n64 P1 = vmul_f32( v1, S1 ); 9858 __n64 R1 = vrsqrts_f32( P1, S1 ); 9859 __n64 Result = vmul_f32( S1, R1 ); 9860 Result = vmul_f32( v1, Result ); 9861 Result = vbsl_f32( VEqualsZero, zero, Result ); 9862 return vcombine_f32( Result, Result ); 9863#elif defined(_XM_SSE_INTRINSICS_) 9864 // Perform the dot product on x,y,z and w 9865 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9866 // vTemp has z and w 9867 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9868 // x+z, y+w 9869 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9870 // x+z,x+z,x+z,y+w 9871 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9872 // ??,??,y+w,y+w 9873 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9874 // ??,??,x+z+y+w,?? 
9875 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9876 // Splat the length 9877 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9878 // Prepare for the division 9879 vLengthSq = _mm_sqrt_ps(vLengthSq); 9880 return vLengthSq; 9881#else // _XM_VMX128_INTRINSICS_ 9882#endif // _XM_VMX128_INTRINSICS_ 9883} 9884 9885//------------------------------------------------------------------------------ 9886// XMVector4NormalizeEst uses a reciprocal estimate and 9887// returns QNaN on zero and infinite vectors. 9888 9889inline XMVECTOR XMVector4NormalizeEst 9890( 9891 FXMVECTOR V 9892) 9893{ 9894#if defined(_XM_NO_INTRINSICS_) 9895 9896 XMVECTOR Result; 9897 Result = XMVector4ReciprocalLength(V); 9898 Result = XMVectorMultiply(V, Result); 9899 return Result; 9900 9901#elif defined(_XM_ARM_NEON_INTRINSICS_) 9902 // Dot4 9903 __n128 vTemp = vmulq_f32( V, V ); 9904 __n64 v1 = vget_low_f32( vTemp ); 9905 __n64 v2 = vget_high_f32( vTemp ); 9906 v1 = vpadd_f32( v1, v1 ); 9907 v2 = vpadd_f32( v2, v2 ); 9908 v1 = vadd_f32( v1, v2 ); 9909 // Reciprocal sqrt (estimate) 9910 v2 = vrsqrte_f32( v1 ); 9911 // Normalize 9912 return vmulq_f32( V, vcombine_f32(v2,v2) ); 9913#elif defined(_XM_SSE_INTRINSICS_) 9914 // Perform the dot product on x,y,z and w 9915 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9916 // vTemp has z and w 9917 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9918 // x+z, y+w 9919 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9920 // x+z,x+z,x+z,y+w 9921 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9922 // ??,??,y+w,y+w 9923 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9924 // ??,??,x+z+y+w,?? 
9925 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9926 // Splat the length 9927 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9928 // Get the reciprocal 9929 XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq); 9930 // Reciprocal mul to perform the normalization 9931 vResult = _mm_mul_ps(vResult,V); 9932 return vResult; 9933#else // _XM_VMX128_INTRINSICS_ 9934#endif // _XM_VMX128_INTRINSICS_ 9935} 9936 9937//------------------------------------------------------------------------------ 9938 9939inline XMVECTOR XMVector4Normalize 9940( 9941 FXMVECTOR V 9942) 9943{ 9944#if defined(_XM_NO_INTRINSICS_) 9945 float fLength; 9946 XMVECTOR vResult; 9947 9948 vResult = XMVector4Length( V ); 9949 fLength = vResult.vector4_f32[0]; 9950 9951 // Prevent divide by zero 9952 if (fLength > 0) { 9953 fLength = 1.0f/fLength; 9954 } 9955 9956 vResult.vector4_f32[0] = V.vector4_f32[0]*fLength; 9957 vResult.vector4_f32[1] = V.vector4_f32[1]*fLength; 9958 vResult.vector4_f32[2] = V.vector4_f32[2]*fLength; 9959 vResult.vector4_f32[3] = V.vector4_f32[3]*fLength; 9960 return vResult; 9961 9962#elif defined(_XM_ARM_NEON_INTRINSICS_) 9963 // Dot4 9964 __n128 vTemp = vmulq_f32( V, V ); 9965 __n64 v1 = vget_low_f32( vTemp ); 9966 __n64 v2 = vget_high_f32( vTemp ); 9967 v1 = vpadd_f32( v1, v1 ); 9968 v2 = vpadd_f32( v2, v2 ); 9969 v1 = vadd_f32( v1, v2 ); 9970 __n64 VEqualsZero = vceq_f32( v1, vdup_n_u32(0) ); 9971 __n64 VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) ); 9972 // Reciprocal sqrt (2 iterations of Newton-Raphson) 9973 __n64 S0 = vrsqrte_f32( v1 ); 9974 __n64 P0 = vmul_f32( v1, S0 ); 9975 __n64 R0 = vrsqrts_f32( P0, S0 ); 9976 __n64 S1 = vmul_f32( S0, R0 ); 9977 __n64 P1 = vmul_f32( v1, S1 ); 9978 __n64 R1 = vrsqrts_f32( P1, S1 ); 9979 v2 = vmul_f32( S1, R1 ); 9980 // Normalize 9981 __n128 vResult = vmulq_f32( V, vcombine_f32(v2,v2) ); 9982 vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult ); 9983 return vbslq_f32( 
vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult ); 9984#elif defined(_XM_SSE_INTRINSICS_) 9985 // Perform the dot product on x,y,z and w 9986 XMVECTOR vLengthSq = _mm_mul_ps(V,V); 9987 // vTemp has z and w 9988 XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2)); 9989 // x+z, y+w 9990 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9991 // x+z,x+z,x+z,y+w 9992 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0)); 9993 // ??,??,y+w,y+w 9994 vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0)); 9995 // ??,??,x+z+y+w,?? 9996 vLengthSq = _mm_add_ps(vLengthSq,vTemp); 9997 // Splat the length 9998 vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2)); 9999 // Prepare for the division 10000 XMVECTOR vResult = _mm_sqrt_ps(vLengthSq); 10001 // Create zero with a single instruction 10002 XMVECTOR vZeroMask = _mm_setzero_ps(); 10003 // Test for a divide by zero (Must be FP to detect -0.0) 10004 vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult); 10005 // Failsafe on zero (Or epsilon) length planes 10006 // If the length is infinity, set the elements to zero 10007 vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity); 10008 // Divide to perform the normalization 10009 vResult = _mm_div_ps(V,vResult); 10010 // Any that are infinity, set to zero 10011 vResult = _mm_and_ps(vResult,vZeroMask); 10012 // Select qnan or result based on infinite length 10013 XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN); 10014 XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq); 10015 vResult = _mm_or_ps(vTemp1,vTemp2); 10016 return vResult; 10017#else // _XM_VMX128_INTRINSICS_ 10018#endif // _XM_VMX128_INTRINSICS_ 10019} 10020 10021//------------------------------------------------------------------------------ 10022 10023inline XMVECTOR XMVector4ClampLength 10024( 10025 FXMVECTOR V, 10026 float LengthMin, 10027 float LengthMax 10028) 10029{ 10030 XMVECTOR ClampMax = XMVectorReplicate(LengthMax); 10031 XMVECTOR ClampMin = XMVectorReplicate(LengthMin); 10032 10033 return 
XMVector4ClampLengthV(V, ClampMin, ClampMax); 10034} 10035 10036//------------------------------------------------------------------------------ 10037 10038inline XMVECTOR XMVector4ClampLengthV 10039( 10040 FXMVECTOR V, 10041 FXMVECTOR LengthMin, 10042 FXMVECTOR LengthMax 10043) 10044{ 10045 assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin))); 10046 assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax))); 10047 assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero())); 10048 assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero())); 10049 assert(XMVector4GreaterOrEqual(LengthMax, LengthMin)); 10050 10051 XMVECTOR LengthSq = XMVector4LengthSq(V); 10052 10053 const XMVECTOR Zero = XMVectorZero(); 10054 10055 XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq); 10056 10057 XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v); 10058 XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero); 10059 10060 XMVECTOR Normal = XMVectorMultiply(V, RcpLength); 10061 10062 XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength); 10063 10064 XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength); 10065 Length = XMVectorSelect(LengthSq, Length, Select); 10066 Normal = XMVectorSelect(LengthSq, Normal, Select); 10067 10068 XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax); 10069 XMVECTOR ControlMin = XMVectorLess(Length, LengthMin); 10070 10071 XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax); 10072 ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin); 10073 10074 XMVECTOR Result = XMVectorMultiply(Normal, ClampLength); 10075 10076 // Preserve the original vector (with no precision loss) if the length falls within the given range 10077 XMVECTOR Control = XMVectorEqualInt(ControlMax, 
ControlMin); 10078 Result = XMVectorSelect(Result, V, Control); 10079 10080 return Result; 10081} 10082 10083//------------------------------------------------------------------------------ 10084 10085inline XMVECTOR XMVector4Reflect 10086( 10087 FXMVECTOR Incident, 10088 FXMVECTOR Normal 10089) 10090{ 10091 // Result = Incident - (2 * dot(Incident, Normal)) * Normal 10092 10093 XMVECTOR Result = XMVector4Dot(Incident, Normal); 10094 Result = XMVectorAdd(Result, Result); 10095 Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident); 10096 10097 return Result; 10098} 10099 10100//------------------------------------------------------------------------------ 10101 10102inline XMVECTOR XMVector4Refract 10103( 10104 FXMVECTOR Incident, 10105 FXMVECTOR Normal, 10106 float RefractionIndex 10107) 10108{ 10109 XMVECTOR Index = XMVectorReplicate(RefractionIndex); 10110 return XMVector4RefractV(Incident, Normal, Index); 10111} 10112 10113//------------------------------------------------------------------------------ 10114 10115inline XMVECTOR XMVector4RefractV 10116( 10117 FXMVECTOR Incident, 10118 FXMVECTOR Normal, 10119 FXMVECTOR RefractionIndex 10120) 10121{ 10122#if defined(_XM_NO_INTRINSICS_) 10123 10124 XMVECTOR IDotN; 10125 XMVECTOR R; 10126 const XMVECTOR Zero = XMVectorZero(); 10127 10128 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) + 10129 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal)))) 10130 10131 IDotN = XMVector4Dot(Incident, Normal); 10132 10133 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 10134 R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v); 10135 R = XMVectorMultiply(R, RefractionIndex); 10136 R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v); 10137 10138 if (XMVector4LessOrEqual(R, Zero)) 10139 { 10140 // Total internal reflection 10141 return Zero; 10142 } 10143 else 10144 { 10145 
XMVECTOR Result; 10146 10147 // R = RefractionIndex * IDotN + sqrt(R) 10148 R = XMVectorSqrt(R); 10149 R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R); 10150 10151 // Result = RefractionIndex * Incident - Normal * R 10152 Result = XMVectorMultiply(RefractionIndex, Incident); 10153 Result = XMVectorNegativeMultiplySubtract(Normal, R, Result); 10154 10155 return Result; 10156 } 10157 10158#elif defined(_XM_ARM_NEON_INTRINSICS_) 10159 XMVECTOR IDotN = XMVector4Dot(Incident,Normal); 10160 10161 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 10162 __n128 R = vmlsq_f32( g_XMOne, IDotN, IDotN); 10163 R = vmulq_f32(R, RefractionIndex); 10164 R = vmlsq_f32(g_XMOne, R, RefractionIndex ); 10165 10166 __n128 vResult = vcleq_f32(R,g_XMZero); 10167 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult)); 10168 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]); 10169 if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU ) 10170 { 10171 // Total internal reflection 10172 vResult = g_XMZero; 10173 } 10174 else 10175 { 10176 // Sqrt(R) 10177 __n128 S0 = vrsqrteq_f32(R); 10178 __n128 P0 = vmulq_f32( R, S0 ); 10179 __n128 R0 = vrsqrtsq_f32( P0, S0 ); 10180 __n128 S1 = vmulq_f32( S0, R0 ); 10181 __n128 P1 = vmulq_f32( R, S1 ); 10182 __n128 R1 = vrsqrtsq_f32( P1, S1 ); 10183 __n128 S2 = vmulq_f32( S1, R1 ); 10184 R = vmulq_f32( R, S2 ); 10185 // R = RefractionIndex * IDotN + sqrt(R) 10186 R = vmlaq_f32( R, RefractionIndex, IDotN ); 10187 // Result = RefractionIndex * Incident - Normal * R 10188 vResult = vmulq_f32(RefractionIndex, Incident); 10189 vResult = vmlsq_f32( vResult, R, Normal ); 10190 } 10191 return vResult; 10192#elif defined(_XM_SSE_INTRINSICS_) 10193 XMVECTOR IDotN = XMVector4Dot(Incident,Normal); 10194 10195 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN) 10196 XMVECTOR R = _mm_mul_ps(IDotN,IDotN); 10197 R = _mm_sub_ps(g_XMOne,R); 10198 R = _mm_mul_ps(R, RefractionIndex); 10199 R = _mm_mul_ps(R, 
RefractionIndex); 10200 R = _mm_sub_ps(g_XMOne,R); 10201 10202 XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero); 10203 if (_mm_movemask_ps(vResult)==0x0f) 10204 { 10205 // Total internal reflection 10206 vResult = g_XMZero; 10207 } 10208 else 10209 { 10210 // R = RefractionIndex * IDotN + sqrt(R) 10211 R = _mm_sqrt_ps(R); 10212 vResult = _mm_mul_ps(RefractionIndex, IDotN); 10213 R = _mm_add_ps(R,vResult); 10214 // Result = RefractionIndex * Incident - Normal * R 10215 vResult = _mm_mul_ps(RefractionIndex, Incident); 10216 R = _mm_mul_ps(R,Normal); 10217 vResult = _mm_sub_ps(vResult,R); 10218 } 10219 return vResult; 10220#else // _XM_VMX128_INTRINSICS_ 10221#endif // _XM_VMX128_INTRINSICS_ 10222} 10223 10224//------------------------------------------------------------------------------ 10225 10226inline XMVECTOR XMVector4Orthogonal 10227( 10228 FXMVECTOR V 10229) 10230{ 10231#if defined(_XM_NO_INTRINSICS_) 10232 10233 XMVECTOR Result; 10234 Result.vector4_f32[0] = V.vector4_f32[2]; 10235 Result.vector4_f32[1] = V.vector4_f32[3]; 10236 Result.vector4_f32[2] = -V.vector4_f32[0]; 10237 Result.vector4_f32[3] = -V.vector4_f32[1]; 10238 return Result; 10239 10240#elif defined(_XM_ARM_NEON_INTRINSICS_) 10241 static const XMVECTORF32 Negate = { 1.f, 1.f, -1.f, -1.f }; 10242 10243 __n128 Result = vcombine_f32( vget_high_f32( V ), vget_low_f32( V ) ); 10244 return vmulq_f32( Result, Negate ); 10245#elif defined(_XM_SSE_INTRINSICS_) 10246 static const XMVECTORF32 FlipZW = {1.0f,1.0f,-1.0f,-1.0f}; 10247 XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,0,3,2)); 10248 vResult = _mm_mul_ps(vResult,FlipZW); 10249 return vResult; 10250#else // _XM_VMX128_INTRINSICS_ 10251#endif // _XM_VMX128_INTRINSICS_ 10252} 10253 10254//------------------------------------------------------------------------------ 10255 10256inline XMVECTOR XMVector4AngleBetweenNormalsEst 10257( 10258 FXMVECTOR N1, 10259 FXMVECTOR N2 10260) 10261{ 10262#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) 
|| defined(_XM_ARM_NEON_INTRINSICS_) 10263 10264 XMVECTOR Result = XMVector4Dot(N1, N2); 10265 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); 10266 Result = XMVectorACosEst(Result); 10267 return Result; 10268 10269#else // _XM_VMX128_INTRINSICS_ 10270#endif // _XM_VMX128_INTRINSICS_ 10271} 10272 10273//------------------------------------------------------------------------------ 10274 10275inline XMVECTOR XMVector4AngleBetweenNormals 10276( 10277 FXMVECTOR N1, 10278 FXMVECTOR N2 10279) 10280{ 10281#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 10282 10283 XMVECTOR Result = XMVector4Dot(N1, N2); 10284 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v); 10285 Result = XMVectorACos(Result); 10286 return Result; 10287 10288#else // _XM_VMX128_INTRINSICS_ 10289#endif // _XM_VMX128_INTRINSICS_ 10290} 10291 10292//------------------------------------------------------------------------------ 10293 10294inline XMVECTOR XMVector4AngleBetweenVectors 10295( 10296 FXMVECTOR V1, 10297 FXMVECTOR V2 10298) 10299{ 10300#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 10301 10302 XMVECTOR L1 = XMVector4ReciprocalLength(V1); 10303 XMVECTOR L2 = XMVector4ReciprocalLength(V2); 10304 10305 XMVECTOR Dot = XMVector4Dot(V1, V2); 10306 10307 L1 = XMVectorMultiply(L1, L2); 10308 10309 XMVECTOR CosAngle = XMVectorMultiply(Dot, L1); 10310 CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v); 10311 10312 return XMVectorACos(CosAngle); 10313 10314#else // _XM_VMX128_INTRINSICS_ 10315#endif // _XM_VMX128_INTRINSICS_ 10316} 10317 10318//------------------------------------------------------------------------------ 10319 10320inline XMVECTOR XMVector4Transform 10321( 10322 FXMVECTOR V, 10323 CXMMATRIX M 10324) 10325{ 10326#if defined(_XM_NO_INTRINSICS_) 10327 float fX = 
(M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]); 10328 float fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]); 10329 float fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]); 10330 float fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]); 10331 XMVECTOR vResult = { 10332 fX, 10333 fY, 10334 fZ, 10335 fW 10336 }; 10337 return vResult; 10338 10339#elif defined(_XM_ARM_NEON_INTRINSICS_) 10340 __n64 VL = vget_low_f32( V ); 10341 XMVECTOR vTemp1 = vdupq_lane_f32( VL, 0 ); // X 10342 XMVECTOR vTemp2 = vdupq_lane_f32( VL, 1 ); // Y 10343 XMVECTOR vResult = vmulq_f32( vTemp1, M.r[0] ); 10344 vResult = vmlaq_f32( vResult, vTemp2, M.r[1] ); 10345 __n64 VH = vget_high_f32( V ); 10346 vTemp1 = vdupq_lane_f32( VH, 0 ); // Z 10347 vTemp2 = vdupq_lane_f32( VH, 1 ); // W 10348 vResult = vmlaq_f32( vResult, vTemp1, M.r[2] ); 10349 return vmlaq_f32( vResult, vTemp2, M.r[3] ); 10350#elif defined(_XM_SSE_INTRINSICS_) 10351 // Splat x,y,z and w 10352 XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0)); 10353 XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1)); 10354 XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2)); 10355 XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3)); 10356 // Mul by the matrix 10357 vTempX = _mm_mul_ps(vTempX,M.r[0]); 10358 vTempY = _mm_mul_ps(vTempY,M.r[1]); 10359 vTempZ = _mm_mul_ps(vTempZ,M.r[2]); 10360 vTempW = _mm_mul_ps(vTempW,M.r[3]); 10361 // Add them all together 10362 vTempX = _mm_add_ps(vTempX,vTempY); 10363 vTempZ = _mm_add_ps(vTempZ,vTempW); 10364 vTempX = _mm_add_ps(vTempX,vTempZ); 10365 return vTempX; 10366#else // _XM_VMX128_INTRINSICS_ 10367#endif // _XM_VMX128_INTRINSICS_ 10368} 10369 
10370//------------------------------------------------------------------------------ 10371_Use_decl_annotations_ 10372inline XMFLOAT4* XMVector4TransformStream 10373( 10374 XMFLOAT4* pOutputStream, 10375 size_t OutputStride, 10376 const XMFLOAT4* pInputStream, 10377 size_t InputStride, 10378 size_t VectorCount, 10379 CXMMATRIX M 10380) 10381{ 10382 assert(pOutputStream != NULL); 10383 assert(pInputStream != NULL); 10384 10385#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_ARM_NEON_INTRINSICS_) 10386 10387 const uint8_t* pInputVector = (const uint8_t*)pInputStream; 10388 uint8_t* pOutputVector = (uint8_t*)pOutputStream; 10389 10390 const XMVECTOR row0 = M.r[0]; 10391 const XMVECTOR row1 = M.r[1]; 10392 const XMVECTOR row2 = M.r[2]; 10393 const XMVECTOR row3 = M.r[3]; 10394 10395 for (size_t i = 0; i < VectorCount; i++) 10396 { 10397 XMVECTOR V = XMLoadFloat4((const XMFLOAT4*)pInputVector); 10398 XMVECTOR W = XMVectorSplatW(V); 10399 XMVECTOR Z = XMVectorSplatZ(V); 10400 XMVECTOR Y = XMVectorSplatY(V); 10401 XMVECTOR X = XMVectorSplatX(V); 10402 10403 XMVECTOR Result = XMVectorMultiply(W, row3); 10404 Result = XMVectorMultiplyAdd(Z, row2, Result); 10405 Result = XMVectorMultiplyAdd(Y, row1, Result); 10406 Result = XMVectorMultiplyAdd(X, row0, Result); 10407 10408 XMStoreFloat4((XMFLOAT4*)pOutputVector, Result); 10409 10410 pInputVector += InputStride; 10411 pOutputVector += OutputStride; 10412 } 10413 10414 return pOutputStream; 10415 10416#else // _XM_VMX128_INTRINSICS_ 10417#endif // _XM_VMX128_INTRINSICS_ 10418} 10419 10420/**************************************************************************** 10421 * 10422 * XMVECTOR operators 10423 * 10424 ****************************************************************************/ 10425 10426//------------------------------------------------------------------------------ 10427 10428inline XMVECTOR operator+ (FXMVECTOR V) 10429{ 10430 return V; 
10431} 10432 10433//------------------------------------------------------------------------------ 10434 10435inline XMVECTOR operator- (FXMVECTOR V) 10436{ 10437 return XMVectorNegate(V); 10438} 10439 10440//------------------------------------------------------------------------------ 10441 10442inline XMVECTOR& operator+= 10443( 10444 XMVECTOR& V1, 10445 FXMVECTOR V2 10446) 10447{ 10448 V1 = XMVectorAdd(V1, V2); 10449 return V1; 10450} 10451 10452//------------------------------------------------------------------------------ 10453 10454inline XMVECTOR& operator-= 10455( 10456 XMVECTOR& V1, 10457 FXMVECTOR V2 10458) 10459{ 10460 V1 = XMVectorSubtract(V1, V2); 10461 return V1; 10462} 10463 10464//------------------------------------------------------------------------------ 10465 10466inline XMVECTOR& operator*= 10467( 10468 XMVECTOR& V1, 10469 FXMVECTOR V2 10470) 10471{ 10472 V1 = XMVectorMultiply(V1, V2); 10473 return V1; 10474} 10475 10476//------------------------------------------------------------------------------ 10477 10478inline XMVECTOR& operator/= 10479( 10480 XMVECTOR& V1, 10481 FXMVECTOR V2 10482) 10483{ 10484 V1 = XMVectorDivide(V1,V2); 10485 return V1; 10486} 10487 10488//------------------------------------------------------------------------------ 10489 10490inline XMVECTOR& operator*= 10491( 10492 XMVECTOR& V, 10493 const float S 10494) 10495{ 10496 V = XMVectorScale(V, S); 10497 return V; 10498} 10499 10500//------------------------------------------------------------------------------ 10501 10502inline XMVECTOR& operator/= 10503( 10504 XMVECTOR& V, 10505 const float S 10506) 10507{ 10508 assert( S != 0.0f ); 10509 V = XMVectorScale(V, 1.0f / S); 10510 return V; 10511} 10512 10513//------------------------------------------------------------------------------ 10514 10515inline XMVECTOR operator+ 10516( 10517 FXMVECTOR V1, 10518 FXMVECTOR V2 10519) 10520{ 10521 return XMVectorAdd(V1, V2); 10522} 10523 
10524//------------------------------------------------------------------------------ 10525 10526inline XMVECTOR operator- 10527( 10528 FXMVECTOR V1, 10529 FXMVECTOR V2 10530) 10531{ 10532 return XMVectorSubtract(V1, V2); 10533} 10534 10535//------------------------------------------------------------------------------ 10536 10537inline XMVECTOR operator* 10538( 10539 FXMVECTOR V1, 10540 FXMVECTOR V2 10541) 10542{ 10543 return XMVectorMultiply(V1, V2); 10544} 10545 10546//------------------------------------------------------------------------------ 10547 10548inline XMVECTOR operator/ 10549( 10550 FXMVECTOR V1, 10551 FXMVECTOR V2 10552) 10553{ 10554 return XMVectorDivide(V1,V2); 10555} 10556 10557//------------------------------------------------------------------------------ 10558 10559inline XMVECTOR operator* 10560( 10561 FXMVECTOR V, 10562 const float S 10563) 10564{ 10565 return XMVectorScale(V, S); 10566} 10567 10568//------------------------------------------------------------------------------ 10569 10570inline XMVECTOR operator/ 10571( 10572 FXMVECTOR V, 10573 const float S 10574) 10575{ 10576 assert( S != 0.0f ); 10577 return XMVectorScale(V, 1.0f / S); 10578} 10579 10580//------------------------------------------------------------------------------ 10581 10582inline XMVECTOR operator* 10583( 10584 float S, 10585 FXMVECTOR V 10586) 10587{ 10588 return XMVectorScale(V, S); 10589} 10590 10591#if defined(_XM_NO_INTRINSICS_) 10592#undef XMISNAN 10593#undef XMISINF 10594#endif 10595 10596