// The game where you go into mines and start crafting! — but for consoles
// (forked directly from smartcmd's GitHub).
//-------------------------------------------------------------------------------------
// DirectXMathConvert.inl -- SIMD C++ Math library
//
// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
// PARTICULAR PURPOSE.
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//-------------------------------------------------------------------------------------
11
12#ifdef _MSC_VER
13#pragma once
14#endif
15
16/****************************************************************************
17 *
18 * Data conversion
19 *
20 ****************************************************************************/
21
22//------------------------------------------------------------------------------
23
24#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
25// For VMX128, these routines are all defines in the main header
26
27#pragma warning(push)
28#pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized
29
30inline XMVECTOR XMConvertVectorIntToFloat
31(
32 FXMVECTOR VInt,
33 uint32_t DivExponent
34)
35{
36 assert(DivExponent<32);
37#if defined(_XM_NO_INTRINSICS_)
38 float fScale = 1.0f / (float)(1U << DivExponent);
39 uint32_t ElementIndex = 0;
40 XMVECTOR Result;
41 do {
42 int32_t iTemp = (int32_t)VInt.vector4_u32[ElementIndex];
43 Result.vector4_f32[ElementIndex] = ((float)iTemp) * fScale;
44 } while (++ElementIndex<4);
45 return Result;
46#elif defined(_XM_ARM_NEON_INTRINSICS_)
47 __n128 vResult = vcvtq_f32_s32( VInt );
48 uint32_t uScale = 0x3F800000U - (DivExponent << 23);
49 __n128 vScale = vdupq_n_u32( uScale );
50 return vmulq_f32( vResult, vScale );
51#else // _XM_SSE_INTRINSICS_
52 // Convert to floats
53 XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
54 // Convert DivExponent into 1.0f/(1<<DivExponent)
55 uint32_t uScale = 0x3F800000U - (DivExponent << 23);
56 // Splat the scalar value
57 __m128i vScale = _mm_set1_epi32(uScale);
58 vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
59 return vResult;
60#endif
61}
62
63//------------------------------------------------------------------------------
64
65inline XMVECTOR XMConvertVectorFloatToInt
66(
67 FXMVECTOR VFloat,
68 uint32_t MulExponent
69)
70{
71 assert(MulExponent<32);
72#if defined(_XM_NO_INTRINSICS_)
73 // Get the scalar factor.
74 float fScale = (float)(1U << MulExponent);
75 uint32_t ElementIndex = 0;
76 XMVECTOR Result;
77 do {
78 int32_t iResult;
79 float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
80 if (fTemp <= -(65536.0f*32768.0f)) {
81 iResult = (-0x7FFFFFFF)-1;
82 } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
83 iResult = 0x7FFFFFFF;
84 } else {
85 iResult = (int32_t)fTemp;
86 }
87 Result.vector4_u32[ElementIndex] = (uint32_t)iResult;
88 } while (++ElementIndex<4);
89 return Result;
90#elif defined(_XM_ARM_NEON_INTRINSICS_)
91 __n128 vResult = vdupq_n_f32((float)(1U << MulExponent));
92 vResult = vmulq_f32(vResult,VFloat);
93 // In case of positive overflow, detect it
94 __n128 vOverflow = vcgtq_f32(vResult,g_XMMaxInt);
95 // Float to int conversion
96 __n128 vResulti = vcvtq_s32_f32(vResult);
97 // If there was positive overflow, set to 0x7FFFFFFF
98 vResult = vandq_u32(vOverflow,g_XMAbsMask);
99 vOverflow = vbicq_u32(vResulti,vOverflow);
100 vOverflow = vorrq_u32(vOverflow,vResult);
101 return vOverflow;
102#else // _XM_SSE_INTRINSICS_
103 XMVECTOR vResult = _mm_set_ps1((float)(1U << MulExponent));
104 vResult = _mm_mul_ps(vResult,VFloat);
105 // In case of positive overflow, detect it
106 XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt);
107 // Float to int conversion
108 __m128i vResulti = _mm_cvttps_epi32(vResult);
109 // If there was positive overflow, set to 0x7FFFFFFF
110 vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
111 vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
112 vOverflow = _mm_or_ps(vOverflow,vResult);
113 return vOverflow;
114#endif
115}
116
117//------------------------------------------------------------------------------
118
119inline XMVECTOR XMConvertVectorUIntToFloat
120(
121 FXMVECTOR VUInt,
122 uint32_t DivExponent
123)
124{
125 assert(DivExponent<32);
126#if defined(_XM_NO_INTRINSICS_)
127 float fScale = 1.0f / (float)(1U << DivExponent);
128 uint32_t ElementIndex = 0;
129 XMVECTOR Result;
130 do {
131 Result.vector4_f32[ElementIndex] = (float)VUInt.vector4_u32[ElementIndex] * fScale;
132 } while (++ElementIndex<4);
133 return Result;
134#elif defined(_XM_ARM_NEON_INTRINSICS_)
135 __n128 vResult = vcvtq_f32_u32( VUInt );
136 uint32_t uScale = 0x3F800000U - (DivExponent << 23);
137 __n128 vScale = vdupq_n_u32( uScale );
138 return vmulq_f32( vResult, vScale );
139#else // _XM_SSE_INTRINSICS_
140 // For the values that are higher than 0x7FFFFFFF, a fixup is needed
141 // Determine which ones need the fix.
142 XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
143 // Force all values positive
144 XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
145 // Convert to floats
146 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
147 // Convert 0x80000000 -> 0xFFFFFFFF
148 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
149 // For only the ones that are too big, add the fixup
150 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
151 vResult = _mm_add_ps(vResult,vMask);
152 // Convert DivExponent into 1.0f/(1<<DivExponent)
153 uint32_t uScale = 0x3F800000U - (DivExponent << 23);
154 // Splat
155 iMask = _mm_set1_epi32(uScale);
156 vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(iMask));
157 return vResult;
158#endif
159}
160
161//------------------------------------------------------------------------------
162
163inline XMVECTOR XMConvertVectorFloatToUInt
164(
165 FXMVECTOR VFloat,
166 uint32_t MulExponent
167)
168{
169 assert(MulExponent<32);
170#if defined(_XM_NO_INTRINSICS_)
171 // Get the scalar factor.
172 float fScale = (float)(1U << MulExponent);
173 uint32_t ElementIndex = 0;
174 XMVECTOR Result;
175 do {
176 uint32_t uResult;
177 float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
178 if (fTemp <= 0.0f) {
179 uResult = 0;
180 } else if (fTemp >= (65536.0f*65536.0f)) {
181 uResult = 0xFFFFFFFFU;
182 } else {
183 uResult = (uint32_t)fTemp;
184 }
185 Result.vector4_u32[ElementIndex] = uResult;
186 } while (++ElementIndex<4);
187 return Result;
188#elif defined(_XM_ARM_NEON_INTRINSICS_)
189 __n128 vResult = vdupq_n_f32((float)(1U << MulExponent));
190 vResult = vmulq_f32(vResult,VFloat);
191 // In case of overflow, detect it
192 __n128 vOverflow = vcgtq_f32(vResult,g_XMMaxUInt);
193 // Float to int conversion
194 __n128 vResulti = vcvtq_u32_f32(vResult);
195 // If there was overflow, set to 0xFFFFFFFFU
196 vResult = vbicq_u32(vResulti,vOverflow);
197 vOverflow = vorrq_u32(vOverflow,vResult);
198 return vOverflow;
199#else // _XM_SSE_INTRINSICS_
200 XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
201 vResult = _mm_mul_ps(vResult,VFloat);
202 // Clamp to >=0
203 vResult = _mm_max_ps(vResult,g_XMZero);
204 // Any numbers that are too big, set to 0xFFFFFFFFU
205 XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
206 XMVECTOR vValue = g_XMUnsignedFix;
207 // Too large for a signed integer?
208 XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
209 // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise
210 vValue = _mm_and_ps(vValue,vMask);
211 // Perform fixup only on numbers too large (Keeps low bit precision)
212 vResult = _mm_sub_ps(vResult,vValue);
213 __m128i vResulti = _mm_cvttps_epi32(vResult);
214 // Convert from signed to unsigned pnly if greater than 0x80000000
215 vMask = _mm_and_ps(vMask,g_XMNegativeZero);
216 vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
217 // On those that are too large, set to 0xFFFFFFFF
218 vResult = _mm_or_ps(vResult,vOverflow);
219 return vResult;
220#endif
221}
222
223#pragma warning(pop)
224
225#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_ || _XM_ARM_NEON_INTRINSICS_
226
227/****************************************************************************
228 *
229 * Vector and matrix load operations
230 *
231 ****************************************************************************/
232
233//------------------------------------------------------------------------------
234_Use_decl_annotations_
235inline XMVECTOR XMLoadInt(const uint32_t* pSource)
236{
237 assert(pSource);
238#if defined(_XM_NO_INTRINSICS_)
239 XMVECTOR V;
240 V.vector4_u32[0] = *pSource;
241 V.vector4_u32[1] = 0;
242 V.vector4_u32[2] = 0;
243 V.vector4_u32[3] = 0;
244 return V;
245#elif defined(_XM_ARM_NEON_INTRINSICS_)
246 __n128 zero = vdupq_n_u32(0);
247 return vld1q_lane_u32( pSource, zero, 0 );
248#elif defined(_XM_SSE_INTRINSICS_)
249 return _mm_load_ss( reinterpret_cast<const float*>(pSource) );
250#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
251#endif // _XM_VMX128_INTRINSICS_
252}
253
254//------------------------------------------------------------------------------
255_Use_decl_annotations_
256inline XMVECTOR XMLoadFloat(const float* pSource)
257{
258 assert(pSource);
259#if defined(_XM_NO_INTRINSICS_)
260 XMVECTOR V;
261 V.vector4_f32[0] = *pSource;
262 V.vector4_f32[1] = 0.f;
263 V.vector4_f32[2] = 0.f;
264 V.vector4_f32[3] = 0.f;
265 return V;
266#elif defined(_XM_ARM_NEON_INTRINSICS_)
267 __n128 zero = vdupq_n_u32(0);
268 return vld1q_lane_f32( pSource, zero, 0 );
269#elif defined(_XM_SSE_INTRINSICS_)
270 return _mm_load_ss( pSource );
271#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
272#endif // _XM_VMX128_INTRINSICS_
273}
274
275//------------------------------------------------------------------------------
276_Use_decl_annotations_
277inline XMVECTOR XMLoadInt2
278(
279 const uint32_t* pSource
280)
281{
282 assert(pSource);
283#if defined(_XM_NO_INTRINSICS_)
284 XMVECTOR V;
285 V.vector4_u32[0] = pSource[0];
286 V.vector4_u32[1] = pSource[1];
287 V.vector4_u32[2] = 0;
288 V.vector4_u32[3] = 0;
289 return V;
290#elif defined(_XM_ARM_NEON_INTRINSICS_)
291 __n64 x = vld1_u32( pSource );
292 __n64 zero = vdup_n_u32(0);
293 return vcombine_u32( x, zero );
294#elif defined(_XM_SSE_INTRINSICS_)
295 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
296 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
297 return _mm_unpacklo_ps( x, y );
298#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
299#endif // _XM_VMX128_INTRINSICS_
300}
301
302//------------------------------------------------------------------------------
303_Use_decl_annotations_
304inline XMVECTOR XMLoadInt2A
305(
306 const uint32_t* pSource
307)
308{
309 assert(pSource);
310 assert(((uintptr_t)pSource & 0xF) == 0);
311#if defined(_XM_NO_INTRINSICS_)
312 XMVECTOR V;
313 V.vector4_u32[0] = pSource[0];
314 V.vector4_u32[1] = pSource[1];
315 V.vector4_u32[2] = 0;
316 V.vector4_u32[3] = 0;
317 return V;
318#elif defined(_XM_ARM_NEON_INTRINSICS_)
319 __n64 x = vld1_u32_ex( pSource, 64 );
320 __n64 zero = vdup_n_u32(0);
321 return vcombine_u32( x, zero );
322#elif defined(_XM_SSE_INTRINSICS_)
323 __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
324 return reinterpret_cast<__m128 *>(&V)[0];
325#else // _XM_VMX128_INTRINSICS_
326#endif // _XM_VMX128_INTRINSICS_
327}
328
329//------------------------------------------------------------------------------
330_Use_decl_annotations_
331inline XMVECTOR XMLoadFloat2
332(
333 const XMFLOAT2* pSource
334)
335{
336 assert(pSource);
337#if defined(_XM_NO_INTRINSICS_)
338 XMVECTOR V;
339 V.vector4_f32[0] = pSource->x;
340 V.vector4_f32[1] = pSource->y;
341 V.vector4_f32[2] = 0.f;
342 V.vector4_f32[3] = 0.f;
343 return V;
344#elif defined(_XM_ARM_NEON_INTRINSICS_)
345 __n64 x = vld1_f32( reinterpret_cast<const float*>(pSource) );
346 __n64 zero = vdup_n_u32(0);
347 return vcombine_f32( x, zero );
348#elif defined(_XM_SSE_INTRINSICS_)
349 __m128 x = _mm_load_ss( &pSource->x );
350 __m128 y = _mm_load_ss( &pSource->y );
351 return _mm_unpacklo_ps( x, y );
352#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
353#endif // _XM_VMX128_INTRINSICS_
354}
355
356//------------------------------------------------------------------------------
357_Use_decl_annotations_
358inline XMVECTOR XMLoadFloat2A
359(
360 const XMFLOAT2A* pSource
361)
362{
363 assert(pSource);
364 assert(((uintptr_t)pSource & 0xF) == 0);
365#if defined(_XM_NO_INTRINSICS_)
366 XMVECTOR V;
367 V.vector4_f32[0] = pSource->x;
368 V.vector4_f32[1] = pSource->y;
369 V.vector4_f32[2] = 0.f;
370 V.vector4_f32[3] = 0.f;
371 return V;
372#elif defined(_XM_ARM_NEON_INTRINSICS_)
373 __n64 x = vld1_f32_ex( reinterpret_cast<const float*>(pSource), 64 );
374 __n64 zero = vdup_n_u32(0);
375 return vcombine_f32( x, zero );
376#elif defined(_XM_SSE_INTRINSICS_)
377 __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
378 return reinterpret_cast<__m128 *>(&V)[0];
379#else // _XM_VMX128_INTRINSICS_
380#endif // _XM_VMX128_INTRINSICS_
381}
382
383//------------------------------------------------------------------------------
384_Use_decl_annotations_
385inline XMVECTOR XMLoadSInt2
386(
387 const XMINT2* pSource
388)
389{
390 assert(pSource);
391#if defined(_XM_NO_INTRINSICS_)
392 XMVECTOR V;
393 V.vector4_f32[0] = (float)pSource->x;
394 V.vector4_f32[1] = (float)pSource->y;
395 V.vector4_f32[2] = 0.f;
396 V.vector4_f32[3] = 0.f;
397 return V;
398#elif defined(_XM_ARM_NEON_INTRINSICS_)
399 __n64 x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
400 __n64 v = vcvt_f32_s32( x );
401 __n64 zero = vdup_n_u32(0);
402 return vcombine_s32( v, zero );
403#elif defined(_XM_SSE_INTRINSICS_)
404 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
405 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
406 __m128 V = _mm_unpacklo_ps( x, y );
407 return _mm_cvtepi32_ps(_mm_castps_si128(V));
408#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
409#endif // _XM_VMX128_INTRINSICS_
410}
411
412//------------------------------------------------------------------------------
413_Use_decl_annotations_
414inline XMVECTOR XMLoadUInt2
415(
416 const XMUINT2* pSource
417)
418{
419 assert(pSource);
420#if defined(_XM_NO_INTRINSICS_)
421 XMVECTOR V;
422 V.vector4_f32[0] = (float)pSource->x;
423 V.vector4_f32[1] = (float)pSource->y;
424 V.vector4_f32[2] = 0.f;
425 V.vector4_f32[3] = 0.f;
426 return V;
427#elif defined(_XM_ARM_NEON_INTRINSICS_)
428 __n64 x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
429 __n64 v = vcvt_f32_u32( x );
430 __n64 zero = vdup_n_u32(0);
431 return vcombine_u32( v, zero );
432#elif defined(_XM_SSE_INTRINSICS_)
433 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
434 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
435 __m128 V = _mm_unpacklo_ps( x, y );
436 // For the values that are higher than 0x7FFFFFFF, a fixup is needed
437 // Determine which ones need the fix.
438 XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
439 // Force all values positive
440 XMVECTOR vResult = _mm_xor_ps(V,vMask);
441 // Convert to floats
442 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
443 // Convert 0x80000000 -> 0xFFFFFFFF
444 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
445 // For only the ones that are too big, add the fixup
446 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
447 vResult = _mm_add_ps(vResult,vMask);
448 return vResult;
449#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
450#endif // _XM_VMX128_INTRINSICS_
451}
452
453//------------------------------------------------------------------------------
454_Use_decl_annotations_
455inline XMVECTOR XMLoadInt3
456(
457 const uint32_t* pSource
458)
459{
460 assert(pSource);
461#if defined(_XM_NO_INTRINSICS_)
462 XMVECTOR V;
463 V.vector4_u32[0] = pSource[0];
464 V.vector4_u32[1] = pSource[1];
465 V.vector4_u32[2] = pSource[2];
466 V.vector4_u32[3] = 0;
467 return V;
468#elif defined(_XM_ARM_NEON_INTRINSICS_)
469 __n64 x = vld1_u32( pSource );
470 __n64 zero = vdup_n_u32(0);
471 __n64 y = vld1_lane_u32( pSource+2, zero, 0 );
472 return vcombine_u32( x, y );
473#elif defined(_XM_SSE_INTRINSICS_)
474 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
475 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
476 __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
477 __m128 xy = _mm_unpacklo_ps( x, y );
478 return _mm_movelh_ps( xy, z );
479#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
480#endif // _XM_VMX128_INTRINSICS_
481}
482
483//------------------------------------------------------------------------------
484_Use_decl_annotations_
485inline XMVECTOR XMLoadInt3A
486(
487 const uint32_t* pSource
488)
489{
490 assert(pSource);
491 assert(((uintptr_t)pSource & 0xF) == 0);
492#if defined(_XM_NO_INTRINSICS_)
493 XMVECTOR V;
494 V.vector4_u32[0] = pSource[0];
495 V.vector4_u32[1] = pSource[1];
496 V.vector4_u32[2] = pSource[2];
497 V.vector4_u32[3] = 0;
498 return V;
499#elif defined(_XM_ARM_NEON_INTRINSICS_)
500 // Reads an extra integer which is zero'd
501 __n128 V = vld1q_u32_ex( pSource, 128 );
502 return vsetq_lane_u32( 0, V, 3 );
503#elif defined(_XM_SSE_INTRINSICS_)
504 // Reads an extra integer which is zero'd
505 __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
506 V = _mm_and_si128( V, g_XMMask3 );
507 return reinterpret_cast<__m128 *>(&V)[0];
508#else // _XM_VMX128_INTRINSICS_
509#endif // _XM_VMX128_INTRINSICS_
510}
511
512//------------------------------------------------------------------------------
513_Use_decl_annotations_
514inline XMVECTOR XMLoadFloat3
515(
516 const XMFLOAT3* pSource
517)
518{
519 assert(pSource);
520#if defined(_XM_NO_INTRINSICS_)
521 XMVECTOR V;
522 V.vector4_f32[0] = pSource->x;
523 V.vector4_f32[1] = pSource->y;
524 V.vector4_f32[2] = pSource->z;
525 V.vector4_f32[3] = 0.f;
526 return V;
527#elif defined(_XM_ARM_NEON_INTRINSICS_)
528 __n64 x = vld1_f32( reinterpret_cast<const float*>(pSource) );
529 __n64 zero = vdup_n_u32(0);
530 __n64 y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 );
531 return vcombine_f32( x, y );
532#elif defined(_XM_SSE_INTRINSICS_)
533 __m128 x = _mm_load_ss( &pSource->x );
534 __m128 y = _mm_load_ss( &pSource->y );
535 __m128 z = _mm_load_ss( &pSource->z );
536 __m128 xy = _mm_unpacklo_ps( x, y );
537 return _mm_movelh_ps( xy, z );
538#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
539#endif // _XM_VMX128_INTRINSICS_
540}
541
542//------------------------------------------------------------------------------
543_Use_decl_annotations_
544inline XMVECTOR XMLoadFloat3A
545(
546 const XMFLOAT3A* pSource
547)
548{
549 assert(pSource);
550 assert(((uintptr_t)pSource & 0xF) == 0);
551#if defined(_XM_NO_INTRINSICS_)
552 XMVECTOR V;
553 V.vector4_f32[0] = pSource->x;
554 V.vector4_f32[1] = pSource->y;
555 V.vector4_f32[2] = pSource->z;
556 V.vector4_f32[3] = 0.f;
557 return V;
558#elif defined(_XM_ARM_NEON_INTRINSICS_)
559 // Reads an extra float which is zero'd
560 __n128 V = vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
561 return vsetq_lane_f32( 0, V, 3 );
562#elif defined(_XM_SSE_INTRINSICS_)
563 // Reads an extra float which is zero'd
564 __m128 V = _mm_load_ps( &pSource->x );
565 return _mm_and_ps( V, g_XMMask3 );
566#else // _XM_VMX128_INTRINSICS_
567#endif // _XM_VMX128_INTRINSICS_
568}
569
570//------------------------------------------------------------------------------
571_Use_decl_annotations_
572inline XMVECTOR XMLoadSInt3
573(
574 const XMINT3* pSource
575)
576{
577 assert(pSource);
578#if defined(_XM_NO_INTRINSICS_)
579
580 XMVECTOR V;
581 V.vector4_f32[0] = (float)pSource->x;
582 V.vector4_f32[1] = (float)pSource->y;
583 V.vector4_f32[2] = (float)pSource->z;
584 V.vector4_f32[3] = 0.f;
585 return V;
586
587#elif defined(_XM_ARM_NEON_INTRINSICS_)
588 __n64 x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
589 __n64 zero = vdup_n_u32(0);
590 __n64 y = vld1_lane_s32( reinterpret_cast<const int32_t*>(pSource)+2, zero, 0 );
591 __n128 v = vcombine_s32( x, y );
592 return vcvtq_f32_s32( v );
593#elif defined(_XM_SSE_INTRINSICS_)
594 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
595 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
596 __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
597 __m128 xy = _mm_unpacklo_ps( x, y );
598 __m128 V = _mm_movelh_ps( xy, z );
599 return _mm_cvtepi32_ps(_mm_castps_si128(V));
600#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
601#endif // _XM_VMX128_INTRINSICS_
602}
603
604//------------------------------------------------------------------------------
605_Use_decl_annotations_
606inline XMVECTOR XMLoadUInt3
607(
608 const XMUINT3* pSource
609)
610{
611 assert(pSource);
612#if defined(_XM_NO_INTRINSICS_)
613 XMVECTOR V;
614 V.vector4_f32[0] = (float)pSource->x;
615 V.vector4_f32[1] = (float)pSource->y;
616 V.vector4_f32[2] = (float)pSource->z;
617 V.vector4_f32[3] = 0.f;
618 return V;
619#elif defined(_XM_ARM_NEON_INTRINSICS_)
620 __n64 x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
621 __n64 zero = vdup_n_u32(0);
622 __n64 y = vld1_lane_u32( reinterpret_cast<const uint32_t*>(pSource)+2, zero, 0 );
623 __n128 v = vcombine_u32( x, y );
624 return vcvtq_f32_u32( v );
625#elif defined(_XM_SSE_INTRINSICS_)
626 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
627 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
628 __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
629 __m128 xy = _mm_unpacklo_ps( x, y );
630 __m128 V = _mm_movelh_ps( xy, z );
631 // For the values that are higher than 0x7FFFFFFF, a fixup is needed
632 // Determine which ones need the fix.
633 XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
634 // Force all values positive
635 XMVECTOR vResult = _mm_xor_ps(V,vMask);
636 // Convert to floats
637 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
638 // Convert 0x80000000 -> 0xFFFFFFFF
639 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
640 // For only the ones that are too big, add the fixup
641 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
642 vResult = _mm_add_ps(vResult,vMask);
643 return vResult;
644
645#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
646#endif // _XM_VMX128_INTRINSICS_
647}
648
649//------------------------------------------------------------------------------
650_Use_decl_annotations_
651inline XMVECTOR XMLoadInt4
652(
653 const uint32_t* pSource
654)
655{
656 assert(pSource);
657
658#if defined(_XM_NO_INTRINSICS_)
659 XMVECTOR V;
660 V.vector4_u32[0] = pSource[0];
661 V.vector4_u32[1] = pSource[1];
662 V.vector4_u32[2] = pSource[2];
663 V.vector4_u32[3] = pSource[3];
664 return V;
665#elif defined(_XM_ARM_NEON_INTRINSICS_)
666 return vld1q_u32( pSource );
667#elif defined(_XM_SSE_INTRINSICS_)
668 __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
669 return reinterpret_cast<__m128 *>(&V)[0];
670#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
671#endif // _XM_VMX128_INTRINSICS_
672}
673
674//------------------------------------------------------------------------------
675_Use_decl_annotations_
676inline XMVECTOR XMLoadInt4A
677(
678 const uint32_t* pSource
679)
680{
681 assert(pSource);
682 assert(((uintptr_t)pSource & 0xF) == 0);
683#if defined(_XM_NO_INTRINSICS_)
684 XMVECTOR V;
685 V.vector4_u32[0] = pSource[0];
686 V.vector4_u32[1] = pSource[1];
687 V.vector4_u32[2] = pSource[2];
688 V.vector4_u32[3] = pSource[3];
689 return V;
690#elif defined(_XM_ARM_NEON_INTRINSICS_)
691 return vld1q_u32_ex( pSource, 128 );
692#elif defined(_XM_SSE_INTRINSICS_)
693 __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
694 return reinterpret_cast<__m128 *>(&V)[0];
695#else // _XM_VMX128_INTRINSICS_
696#endif // _XM_VMX128_INTRINSICS_
697}
698
699//------------------------------------------------------------------------------
700_Use_decl_annotations_
701inline XMVECTOR XMLoadFloat4
702(
703 const XMFLOAT4* pSource
704)
705{
706 assert(pSource);
707#if defined(_XM_NO_INTRINSICS_)
708 XMVECTOR V;
709 V.vector4_f32[0] = pSource->x;
710 V.vector4_f32[1] = pSource->y;
711 V.vector4_f32[2] = pSource->z;
712 V.vector4_f32[3] = pSource->w;
713 return V;
714#elif defined(_XM_ARM_NEON_INTRINSICS_)
715 return vld1q_f32( reinterpret_cast<const float*>(pSource) );
716#elif defined(_XM_SSE_INTRINSICS_)
717 return _mm_loadu_ps( &pSource->x );
718#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
719#endif // _XM_VMX128_INTRINSICS_
720}
721
722//------------------------------------------------------------------------------
723_Use_decl_annotations_
724inline XMVECTOR XMLoadFloat4A
725(
726 const XMFLOAT4A* pSource
727)
728{
729 assert(pSource);
730 assert(((uintptr_t)pSource & 0xF) == 0);
731#if defined(_XM_NO_INTRINSICS_)
732 XMVECTOR V;
733 V.vector4_f32[0] = pSource->x;
734 V.vector4_f32[1] = pSource->y;
735 V.vector4_f32[2] = pSource->z;
736 V.vector4_f32[3] = pSource->w;
737 return V;
738#elif defined(_XM_ARM_NEON_INTRINSICS_)
739 return vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
740#elif defined(_XM_SSE_INTRINSICS_)
741 return _mm_load_ps( &pSource->x );
742#else // _XM_VMX128_INTRINSICS_
743#endif // _XM_VMX128_INTRINSICS_
744}
745
746//------------------------------------------------------------------------------
747_Use_decl_annotations_
748inline XMVECTOR XMLoadSInt4
749(
750 const XMINT4* pSource
751)
752{
753 assert(pSource);
754#if defined(_XM_NO_INTRINSICS_)
755
756 XMVECTOR V;
757 V.vector4_f32[0] = (float)pSource->x;
758 V.vector4_f32[1] = (float)pSource->y;
759 V.vector4_f32[2] = (float)pSource->z;
760 V.vector4_f32[3] = (float)pSource->w;
761 return V;
762
763#elif defined(_XM_ARM_NEON_INTRINSICS_)
764 __n128 v = vld1q_s32( reinterpret_cast<const int32_t*>(pSource) );
765 return vcvtq_f32_s32( v );
766#elif defined(_XM_SSE_INTRINSICS_)
767 __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
768 return _mm_cvtepi32_ps(V);
769#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
770#endif // _XM_VMX128_INTRINSICS_
771}
772
773//------------------------------------------------------------------------------
774_Use_decl_annotations_
775inline XMVECTOR XMLoadUInt4
776(
777 const XMUINT4* pSource
778)
779{
780 assert(pSource);
781#if defined(_XM_NO_INTRINSICS_)
782 XMVECTOR V;
783 V.vector4_f32[0] = (float)pSource->x;
784 V.vector4_f32[1] = (float)pSource->y;
785 V.vector4_f32[2] = (float)pSource->z;
786 V.vector4_f32[3] = (float)pSource->w;
787 return V;
788#elif defined(_XM_ARM_NEON_INTRINSICS_)
789 __n128 v = vld1q_u32( reinterpret_cast<const uint32_t*>(pSource) );
790 return vcvtq_f32_u32( v );
791#elif defined(_XM_SSE_INTRINSICS_)
792 __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
793 // For the values that are higher than 0x7FFFFFFF, a fixup is needed
794 // Determine which ones need the fix.
795 XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero);
796 // Force all values positive
797 XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask);
798 // Convert to floats
799 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
800 // Convert 0x80000000 -> 0xFFFFFFFF
801 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
802 // For only the ones that are too big, add the fixup
803 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
804 vResult = _mm_add_ps(vResult,vMask);
805 return vResult;
806#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
807#endif // _XM_VMX128_INTRINSICS_
808}
809
810//------------------------------------------------------------------------------
811_Use_decl_annotations_
812inline XMMATRIX XMLoadFloat3x3
813(
814 const XMFLOAT3X3* pSource
815)
816{
817 assert(pSource);
818#if defined(_XM_NO_INTRINSICS_)
819
820 XMMATRIX M;
821 M.r[0].vector4_f32[0] = pSource->m[0][0];
822 M.r[0].vector4_f32[1] = pSource->m[0][1];
823 M.r[0].vector4_f32[2] = pSource->m[0][2];
824 M.r[0].vector4_f32[3] = 0.0f;
825
826 M.r[1].vector4_f32[0] = pSource->m[1][0];
827 M.r[1].vector4_f32[1] = pSource->m[1][1];
828 M.r[1].vector4_f32[2] = pSource->m[1][2];
829 M.r[1].vector4_f32[3] = 0.0f;
830
831 M.r[2].vector4_f32[0] = pSource->m[2][0];
832 M.r[2].vector4_f32[1] = pSource->m[2][1];
833 M.r[2].vector4_f32[2] = pSource->m[2][2];
834 M.r[2].vector4_f32[3] = 0.0f;
835 M.r[3].vector4_f32[0] = 0.0f;
836 M.r[3].vector4_f32[1] = 0.0f;
837 M.r[3].vector4_f32[2] = 0.0f;
838 M.r[3].vector4_f32[3] = 1.0f;
839 return M;
840
841#elif defined(_XM_ARM_NEON_INTRINSICS_)
842 __n128 v0 = vld1q_f32( &pSource->m[0][0] );
843 __n128 v1 = vld1q_f32( &pSource->m[1][1] );
844 __n64 v2 = vcreate_f32( (uint64_t)*(const uint32_t*)&pSource->m[2][2] );
845 __n128 T = vextq_f32( v0, v1, 3 );
846
847 XMMATRIX M;
848 M.r[0] = vandq_u32( v0, g_XMMask3 );
849 M.r[1] = vandq_u32( T, g_XMMask3 );
850 M.r[2] = vcombine_f32( vget_high_f32(v1), v2 );
851 M.r[3] = g_XMIdentityR3;
852 return M;
853#elif defined(_XM_SSE_INTRINSICS_)
854 __m128 Z = _mm_setzero_ps();
855
856 __m128 V1 = _mm_loadu_ps( &pSource->m[0][0] );
857 __m128 V2 = _mm_loadu_ps( &pSource->m[1][1] );
858 __m128 V3 = _mm_load_ss( &pSource->m[2][2] );
859
860 __m128 T1 = _mm_unpackhi_ps( V1, Z );
861 __m128 T2 = _mm_unpacklo_ps( V2, Z );
862 __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
863 __m128 T4 = _mm_movehl_ps( T2, T3 );
864 __m128 T5 = _mm_movehl_ps( Z, T1 );
865
866 XMMATRIX M;
867 M.r[0] = _mm_movelh_ps( V1, T1 );
868 M.r[1] = _mm_add_ps( T4, T5 );
869 M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
870 M.r[3] = g_XMIdentityR3;
871 return M;
872#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
873#endif // _XM_VMX128_INTRINSICS_
874}
875
876//------------------------------------------------------------------------------
877_Use_decl_annotations_
878inline XMMATRIX XMLoadFloat4x3
879(
880 const XMFLOAT4X3* pSource
881)
882{
883 assert(pSource);
884#if defined(_XM_NO_INTRINSICS_)
885
886 XMMATRIX M;
887 M.r[0].vector4_f32[0] = pSource->m[0][0];
888 M.r[0].vector4_f32[1] = pSource->m[0][1];
889 M.r[0].vector4_f32[2] = pSource->m[0][2];
890 M.r[0].vector4_f32[3] = 0.0f;
891
892 M.r[1].vector4_f32[0] = pSource->m[1][0];
893 M.r[1].vector4_f32[1] = pSource->m[1][1];
894 M.r[1].vector4_f32[2] = pSource->m[1][2];
895 M.r[1].vector4_f32[3] = 0.0f;
896
897 M.r[2].vector4_f32[0] = pSource->m[2][0];
898 M.r[2].vector4_f32[1] = pSource->m[2][1];
899 M.r[2].vector4_f32[2] = pSource->m[2][2];
900 M.r[2].vector4_f32[3] = 0.0f;
901
902 M.r[3].vector4_f32[0] = pSource->m[3][0];
903 M.r[3].vector4_f32[1] = pSource->m[3][1];
904 M.r[3].vector4_f32[2] = pSource->m[3][2];
905 M.r[3].vector4_f32[3] = 1.0f;
906 return M;
907
908#elif defined(_XM_ARM_NEON_INTRINSICS_)
909 __n128 v0 = vld1q_f32( &pSource->m[0][0] );
910 __n128 v1 = vld1q_f32( &pSource->m[1][1] );
911 __n128 v2 = vld1q_f32( &pSource->m[2][2] );
912
913 __n128 T1 = vextq_f32( v0, v1, 3 );
914 __n128 T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
915 __n128 T3 = vextq_f32( v2, v2, 1 );
916
917 XMMATRIX M;
918 M.r[0] = vandq_u32( v0, g_XMMask3 );
919 M.r[1] = vandq_u32( T1, g_XMMask3 );
920 M.r[2] = vandq_u32( T2, g_XMMask3 );
921 M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
922 return M;
923#elif defined(_XM_SSE_INTRINSICS_)
924 // Use unaligned load instructions to
925 // load the 12 floats
926 // vTemp1 = x1,y1,z1,x2
927 XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
928 // vTemp2 = y2,z2,x3,y3
929 XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
930 // vTemp4 = z3,x4,y4,z4
931 XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
932 // vTemp3 = x3,y3,z3,z3
933 XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
934 // vTemp2 = y2,z2,x2,x2
935 vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
936 // vTemp2 = x2,y2,z2,z2
937 vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
938 // vTemp1 = x1,y1,z1,0
939 vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
940 // vTemp2 = x2,y2,z2,0
941 vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
942 // vTemp3 = x3,y3,z3,0
943 vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
944 // vTemp4i = x4,y4,z4,0
945 __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
946 // vTemp4i = x4,y4,z4,1.0f
947 vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
948 XMMATRIX M(vTemp1,
949 vTemp2,
950 vTemp3,
951 _mm_castsi128_ps(vTemp4i));
952 return M;
953#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
954#endif // _XM_VMX128_INTRINSICS_
955}
956
957//------------------------------------------------------------------------------
// Loads a 16-byte-aligned XMFLOAT4X3A (12 packed floats, row-major) into an
// XMMATRIX, promoting it to 4x4 with an implicit fourth column of (0,0,0,1).
_Use_decl_annotations_
inline XMMATRIX XMLoadFloat4x3A
(
    const XMFLOAT4X3A* pSource
)
{
    assert(pSource);
    // Aligned variant: the source must sit on a 16-byte boundary.
    assert(((uintptr_t)pSource & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

    // Scalar path: copy the three stored columns of each row and synthesize w.
    XMMATRIX M;
    M.r[0].vector4_f32[0] = pSource->m[0][0];
    M.r[0].vector4_f32[1] = pSource->m[0][1];
    M.r[0].vector4_f32[2] = pSource->m[0][2];
    M.r[0].vector4_f32[3] = 0.0f;

    M.r[1].vector4_f32[0] = pSource->m[1][0];
    M.r[1].vector4_f32[1] = pSource->m[1][1];
    M.r[1].vector4_f32[2] = pSource->m[1][2];
    M.r[1].vector4_f32[3] = 0.0f;

    M.r[2].vector4_f32[0] = pSource->m[2][0];
    M.r[2].vector4_f32[1] = pSource->m[2][1];
    M.r[2].vector4_f32[2] = pSource->m[2][2];
    M.r[2].vector4_f32[3] = 0.0f;

    M.r[3].vector4_f32[0] = pSource->m[3][0];
    M.r[3].vector4_f32[1] = pSource->m[3][1];
    M.r[3].vector4_f32[2] = pSource->m[3][2];
    M.r[3].vector4_f32[3] = 1.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Three aligned (128-bit-hinted) loads cover the 12 floats; vext/vcombine
    // realign rows, then mask w to 0 (rows 0-2) or set it to 1 (row 3).
    __n128 v0 = vld1q_f32_ex( &pSource->m[0][0], 128 );
    __n128 v1 = vld1q_f32_ex( &pSource->m[1][1], 128 );
    __n128 v2 = vld1q_f32_ex( &pSource->m[2][2], 128 );

    __n128 T1 = vextq_f32( v0, v1, 3 );
    __n128 T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
    __n128 T3 = vextq_f32( v2, v2, 1 );

    XMMATRIX M;
    M.r[0] = vandq_u32( v0, g_XMMask3 );
    M.r[1] = vandq_u32( T1, g_XMMask3 );
    M.r[2] = vandq_u32( T2, g_XMMask3 );
    M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    // Use aligned load instructions to
    // load the 12 floats
    // vTemp1 = x1,y1,z1,x2
    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
    // vTemp2 = y2,z2,x3,y3
    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
    // vTemp4 = z3,x4,y4,z4
    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
    // vTemp3 = x3,y3,z3,z3
    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
    // vTemp2 = y2,z2,x2,x2
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
    // vTemp2 = x2,y2,z2,z2
    vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
    // vTemp1 = x1,y1,z1,0
    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
    // vTemp2 = x2,y2,z2,0
    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
    // vTemp3 = x3,y3,z3,0
    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
    // vTemp4i = x4,y4,z4,0
    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
    // vTemp4i = x4,y4,z4,1.0f
    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
    XMMATRIX M(vTemp1,
            vTemp2,
            vTemp3,
            _mm_castsi128_ps(vTemp4i));
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1038
1039//------------------------------------------------------------------------------
1040_Use_decl_annotations_
1041inline XMMATRIX XMLoadFloat4x4
1042(
1043 const XMFLOAT4X4* pSource
1044)
1045{
1046 assert(pSource);
1047#if defined(_XM_NO_INTRINSICS_)
1048
1049 XMMATRIX M;
1050 M.r[0].vector4_f32[0] = pSource->m[0][0];
1051 M.r[0].vector4_f32[1] = pSource->m[0][1];
1052 M.r[0].vector4_f32[2] = pSource->m[0][2];
1053 M.r[0].vector4_f32[3] = pSource->m[0][3];
1054
1055 M.r[1].vector4_f32[0] = pSource->m[1][0];
1056 M.r[1].vector4_f32[1] = pSource->m[1][1];
1057 M.r[1].vector4_f32[2] = pSource->m[1][2];
1058 M.r[1].vector4_f32[3] = pSource->m[1][3];
1059
1060 M.r[2].vector4_f32[0] = pSource->m[2][0];
1061 M.r[2].vector4_f32[1] = pSource->m[2][1];
1062 M.r[2].vector4_f32[2] = pSource->m[2][2];
1063 M.r[2].vector4_f32[3] = pSource->m[2][3];
1064
1065 M.r[3].vector4_f32[0] = pSource->m[3][0];
1066 M.r[3].vector4_f32[1] = pSource->m[3][1];
1067 M.r[3].vector4_f32[2] = pSource->m[3][2];
1068 M.r[3].vector4_f32[3] = pSource->m[3][3];
1069 return M;
1070
1071#elif defined(_XM_ARM_NEON_INTRINSICS_)
1072 XMMATRIX M;
1073 M.r[0] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_11) );
1074 M.r[1] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_21) );
1075 M.r[2] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_31) );
1076 M.r[3] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_41) );
1077 return M;
1078#elif defined(_XM_SSE_INTRINSICS_)
1079 XMMATRIX M;
1080 M.r[0] = _mm_loadu_ps( &pSource->_11 );
1081 M.r[1] = _mm_loadu_ps( &pSource->_21 );
1082 M.r[2] = _mm_loadu_ps( &pSource->_31 );
1083 M.r[3] = _mm_loadu_ps( &pSource->_41 );
1084 return M;
1085#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1086#endif // _XM_VMX128_INTRINSICS_
1087}
1088
1089//------------------------------------------------------------------------------
// Loads a 16-byte-aligned XMFLOAT4X4A (16 floats, row-major) into an
// XMMATRIX; all 16 elements are copied verbatim.
_Use_decl_annotations_
inline XMMATRIX XMLoadFloat4x4A
(
    const XMFLOAT4X4A* pSource
)
{
    assert(pSource);
    // Aligned variant: the source must sit on a 16-byte boundary.
    assert(((uintptr_t)pSource & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

    // Scalar path: straight element-by-element copy.
    XMMATRIX M;
    M.r[0].vector4_f32[0] = pSource->m[0][0];
    M.r[0].vector4_f32[1] = pSource->m[0][1];
    M.r[0].vector4_f32[2] = pSource->m[0][2];
    M.r[0].vector4_f32[3] = pSource->m[0][3];

    M.r[1].vector4_f32[0] = pSource->m[1][0];
    M.r[1].vector4_f32[1] = pSource->m[1][1];
    M.r[1].vector4_f32[2] = pSource->m[1][2];
    M.r[1].vector4_f32[3] = pSource->m[1][3];

    M.r[2].vector4_f32[0] = pSource->m[2][0];
    M.r[2].vector4_f32[1] = pSource->m[2][1];
    M.r[2].vector4_f32[2] = pSource->m[2][2];
    M.r[2].vector4_f32[3] = pSource->m[2][3];

    M.r[3].vector4_f32[0] = pSource->m[3][0];
    M.r[3].vector4_f32[1] = pSource->m[3][1];
    M.r[3].vector4_f32[2] = pSource->m[3][2];
    M.r[3].vector4_f32[3] = pSource->m[3][3];
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // One aligned (128-bit-hinted) 4-float load per row.
    XMMATRIX M;
    M.r[0] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_11), 128 );
    M.r[1] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_21), 128 );
    M.r[2] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_31), 128 );
    M.r[3] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_41), 128 );
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    // One aligned 4-float load per row.
    XMMATRIX M;
    M.r[0] = _mm_load_ps( &pSource->_11 );
    M.r[1] = _mm_load_ps( &pSource->_21 );
    M.r[2] = _mm_load_ps( &pSource->_31 );
    M.r[3] = _mm_load_ps( &pSource->_41 );
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1139
1140/****************************************************************************
1141 *
1142 * Vector and matrix store operations
1143 *
1144 ****************************************************************************/
// Stores the x lane of V as a single 32-bit integer at pDestination
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreInt
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    *pDestination = XMVectorGetIntX( V );
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store lane 0 only.
    vst1q_lane_u32( pDestination, V, 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Bitwise store of the low 32 bits (the float cast does not convert).
    _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1162
1163//------------------------------------------------------------------------------
// Stores the x lane of V as a single float at pDestination
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreFloat
(
    float* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    *pDestination = XMVectorGetX( V );
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store lane 0 only.
    vst1q_lane_f32( pDestination, V, 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_store_ss( pDestination, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1181
1182//------------------------------------------------------------------------------
// Stores the x and y lanes of V as two 32-bit integers at pDestination
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreInt2
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store the low 64 bits (lanes 0 and 1).
    __n64 VL = vget_low_u32(V);
    vst1_u32( pDestination, VL );
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1, then emit two scalar stores (bitwise, no conversion).
    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1204
1205//------------------------------------------------------------------------------
// Stores the x and y lanes of V as two 32-bit integers at a 16-byte-aligned
// destination.
_Use_decl_annotations_
inline void XMStoreInt2A
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store the low 64 bits with a 64-bit alignment hint.
    __n64 VL = vget_low_u32(V);
    vst1_u32_ex( pDestination, VL, 64 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Single 64-bit store of the two low lanes.
    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1226
1227//------------------------------------------------------------------------------
// Stores the x and y lanes of V into an XMFLOAT2 (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreFloat2
(
    XMFLOAT2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store the low 64 bits (lanes 0 and 1).
    __n64 VL = vget_low_f32(V);
    vst1_f32( reinterpret_cast<float*>(pDestination), VL );
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1, then emit two scalar stores.
    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( &pDestination->x, V );
    _mm_store_ss( &pDestination->y, T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1249
1250//------------------------------------------------------------------------------
// Stores the x and y lanes of V into a 16-byte-aligned XMFLOAT2A.
_Use_decl_annotations_
inline void XMStoreFloat2A
(
    XMFLOAT2A* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store the low 64 bits with a 64-bit alignment hint.
    __n64 VL = vget_low_f32(V);
    vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Single 64-bit store of the two low lanes.
    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1271
1272//------------------------------------------------------------------------------
// Converts the x and y lanes of V from float to signed int (truncation) and
// stores them into an XMINT2. On the SSE path, values above INT_MAX saturate
// to 0x7FFFFFFF.
_Use_decl_annotations_
inline void XMStoreSInt2
(
    XMINT2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (int32_t)V.vector4_f32[0];
    pDestination->y = (int32_t)V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Convert the low two lanes and store them as a 64-bit pair.
    __n64 v = vget_low_s32(V);
    v = vcvt_s32_f32( v );
    vst1_s32( reinterpret_cast<int32_t*>(pDestination), v );
#elif defined(_XM_SSE_INTRINSICS_)
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(V);
    // If there was positive overflow, set to 0x7FFFFFFF
    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
    vOverflow = _mm_or_ps(vOverflow,vResult);
    // Write two ints
    XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1304
1305//------------------------------------------------------------------------------
// Converts the x and y lanes of V from float to unsigned int and stores them
// into an XMUINT2. On the SSE path, negative inputs clamp to 0 and values
// above UINT_MAX saturate to 0xFFFFFFFF.
_Use_decl_annotations_
inline void XMStoreUInt2
(
    XMUINT2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (uint32_t)V.vector4_f32[0];
    pDestination->y = (uint32_t)V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Convert the low two lanes and store them as a 64-bit pair.
    __n64 v = vget_low_u32(V);
    v = vcvt_u32_f32( v );
    vst1_u32( reinterpret_cast<uint32_t*>(pDestination), v );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to >=0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    // Write two uints
    XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1346
1347//------------------------------------------------------------------------------
// Stores the x, y and z lanes of V as three 32-bit integers at pDestination
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreInt3
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 64-bit store of lanes 0-1, then a lane store for lane 2.
    __n64 VL = vget_low_u32(V);
    vst1_u32( pDestination, VL );
    vst1q_lane_u32( pDestination+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lanes 1 and 2, then emit three scalar stores (bitwise).
    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1373
1374//------------------------------------------------------------------------------
// Stores the x, y and z lanes of V as three 32-bit integers at a
// 16-byte-aligned destination.
_Use_decl_annotations_
inline void XMStoreInt3A
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Aligned 64-bit store of lanes 0-1, then a lane store for lane 2.
    __n64 VL = vget_low_u32(V);
    vst1_u32_ex( pDestination, VL, 64 );
    vst1q_lane_u32( pDestination+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // 64-bit store of lanes 0-1, then a scalar store of lane 2.
    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1399
1400//------------------------------------------------------------------------------
// Stores the x, y and z lanes of V into an XMFLOAT3
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreFloat3
(
    XMFLOAT3* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 64-bit store of lanes 0-1, then a lane store for lane 2.
    __n64 VL = vget_low_f32(V);
    vst1_f32( reinterpret_cast<float*>(pDestination), VL );
    vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lanes 1 and 2, then emit three scalar stores.
    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( &pDestination->x, V );
    _mm_store_ss( &pDestination->y, T1 );
    _mm_store_ss( &pDestination->z, T2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1426
1427//------------------------------------------------------------------------------
// Stores the x, y and z lanes of V into a 16-byte-aligned XMFLOAT3A.
_Use_decl_annotations_
inline void XMStoreFloat3A
(
    XMFLOAT3A* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Aligned 64-bit store of lanes 0-1, then a lane store for lane 2.
    __n64 VL = vget_low_f32(V);
    vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
    vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // 64-bit store of lanes 0-1, then a scalar store of lane 2.
    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
    _mm_store_ss( &pDestination->z, T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1452
1453//------------------------------------------------------------------------------
// Converts the x, y and z lanes of V from float to signed int (truncation)
// and stores them into an XMINT3. On the SSE path, values above INT_MAX
// saturate to 0x7FFFFFFF.
_Use_decl_annotations_
inline void XMStoreSInt3
(
    XMINT3* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (int32_t)V.vector4_f32[0];
    pDestination->y = (int32_t)V.vector4_f32[1];
    pDestination->z = (int32_t)V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Convert all lanes, store lanes 0-1 as a pair then lane 2 alone.
    __n128 v = vcvtq_s32_f32(V);
    __n64 vL = vget_low_s32(v);
    vst1_s32( reinterpret_cast<int32_t*>(pDestination), vL );
    vst1q_lane_s32( reinterpret_cast<int32_t*>(pDestination)+2, v, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(V);
    // If there was positive overflow, set to 0x7FFFFFFF
    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
    vOverflow = _mm_or_ps(vOverflow,vResult);
    // Write 3 ints
    XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1489
1490//------------------------------------------------------------------------------
// Converts the x, y and z lanes of V from float to unsigned int and stores
// them into an XMUINT3. On the SSE path, negative inputs clamp to 0 and
// values above UINT_MAX saturate to 0xFFFFFFFF.
_Use_decl_annotations_
inline void XMStoreUInt3
(
    XMUINT3* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (uint32_t)V.vector4_f32[0];
    pDestination->y = (uint32_t)V.vector4_f32[1];
    pDestination->z = (uint32_t)V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Convert all lanes, store lanes 0-1 as a pair then lane 2 alone.
    __n128 v = vcvtq_u32_f32(V);
    __n64 vL = vget_low_u32(v);
    vst1_u32( reinterpret_cast<uint32_t*>(pDestination), vL );
    vst1q_lane_u32( reinterpret_cast<uint32_t*>(pDestination)+2, v, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to >=0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    // Write 3 uints
    XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1535
1536//------------------------------------------------------------------------------
// Stores all four lanes of V as 32-bit integers at pDestination
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreInt4
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
    pDestination[3] = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_u32( pDestination, V );
#elif defined(_XM_SSE_INTRINSICS_)
    // Unaligned 128-bit store (bitwise, no conversion).
    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1557
1558//------------------------------------------------------------------------------
// Stores all four lanes of V as 32-bit integers at a 16-byte-aligned
// destination.
_Use_decl_annotations_
inline void XMStoreInt4A
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
    pDestination[3] = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 128-bit store with alignment hint.
    vst1q_u32_ex( pDestination, V, 128 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Aligned 128-bit store (bitwise, no conversion).
    _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1580
1581
1582//------------------------------------------------------------------------------
// Stores all four lanes of V into an XMFLOAT4 (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreFloat4
(
    XMFLOAT4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
    pDestination->w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_f32( reinterpret_cast<float*>(pDestination), V );
#elif defined(_XM_SSE_INTRINSICS_)
    // Unaligned 128-bit store.
    _mm_storeu_ps( &pDestination->x, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1603
1604//------------------------------------------------------------------------------
// Stores all four lanes of V into a 16-byte-aligned XMFLOAT4A.
_Use_decl_annotations_
inline void XMStoreFloat4A
(
    XMFLOAT4A* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
    pDestination->w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 128-bit store with alignment hint.
    vst1q_f32_ex( reinterpret_cast<float*>(pDestination), V, 128 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Aligned 128-bit store.
    _mm_store_ps( &pDestination->x, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1626
1627
1628//------------------------------------------------------------------------------
// Converts all four lanes of V from float to signed int (truncation) and
// stores them into an XMINT4. On the SSE path, values above INT_MAX saturate
// to 0x7FFFFFFF.
_Use_decl_annotations_
inline void XMStoreSInt4
(
    XMINT4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (int32_t)V.vector4_f32[0];
    pDestination->y = (int32_t)V.vector4_f32[1];
    pDestination->z = (int32_t)V.vector4_f32[2];
    pDestination->w = (int32_t)V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 v = vcvtq_s32_f32(V);
    vst1q_s32( reinterpret_cast<int32_t*>(pDestination), v );
#elif defined(_XM_SSE_INTRINSICS_)
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(V);
    // If there was positive overflow, set to 0x7FFFFFFF
    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
    vOverflow = _mm_or_ps(vOverflow,vResult);
    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1658
1659//------------------------------------------------------------------------------
// Converts all four lanes of V from float to unsigned int and stores them
// into an XMUINT4. On the SSE path, negative inputs clamp to 0 and values
// above UINT_MAX saturate to 0xFFFFFFFF.
_Use_decl_annotations_
inline void XMStoreUInt4
(
    XMUINT4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (uint32_t)V.vector4_f32[0];
    pDestination->y = (uint32_t)V.vector4_f32[1];
    pDestination->z = (uint32_t)V.vector4_f32[2];
    pDestination->w = (uint32_t)V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 v = vcvtq_u32_f32(V);
    vst1q_u32( reinterpret_cast<uint32_t*>(pDestination), v );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to >=0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1698
1699//------------------------------------------------------------------------------
// Stores the upper-left 3x3 of matrix M into an XMFLOAT3X3 (9 packed floats,
// row-major, no alignment requirement); the w lanes and fourth row of M are
// discarded.
_Use_decl_annotations_
inline void XMStoreFloat3x3
(
    XMFLOAT3X3* pDestination,
    CXMMATRIX M
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)

    // Scalar path: copy three floats from each of the first three rows.
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Pack the 9 floats into two full vectors plus one lane:
    // (x1,y1,z1,x2), (y2,z2,x3,y3), then z3 alone.
    __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 );
    __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
    vst1q_f32( &pDestination->m[0][0], T2 );

    T1 = vextq_f32( M.r[1], M.r[1], 1 );
    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
    vst1q_f32( &pDestination->m[1][1], T2 );

    vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Same packing with shuffles: two unaligned 16-byte stores plus one
    // scalar store for the final element.
    XMVECTOR vTemp1 = M.r[0];
    XMVECTOR vTemp2 = M.r[1];
    XMVECTOR vTemp3 = M.r[2];
    XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
    vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
    vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1746
1747
1748//------------------------------------------------------------------------------
// Stores the first three columns of each row of matrix M into an XMFLOAT4X3
// (12 packed floats, row-major, no alignment requirement); the w lanes of M
// are discarded.
_Use_decl_annotations_
inline void XMStoreFloat4x3
(
    XMFLOAT4X3* pDestination,
    CXMMATRIX M
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)

    // Scalar path: copy three floats from each of the four rows.
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];

    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Pack the 12 floats into three full vectors:
    // (x1,y1,z1,x2), (y2,z2,x3,y3), (z3,x4,y4,z4).
    __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 );
    __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
    vst1q_f32( &pDestination->m[0][0], T2 );

    T1 = vextq_f32( M.r[1], M.r[1], 1 );
    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
    vst1q_f32( &pDestination->m[1][1], T2 );

    T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
    T2 = vextq_f32( T1, M.r[3], 3 );
    vst1q_f32( &pDestination->m[2][2], T2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Same packing with shuffles: three unaligned 16-byte stores.
    XMVECTOR vTemp1 = M.r[0];
    XMVECTOR vTemp2 = M.r[1];
    XMVECTOR vTemp3 = M.r[2];
    XMVECTOR vTemp4 = M.r[3];
    // vTemp2x = y2,z2,x3,y3 (Final)
    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    // vTemp2 = x2,x2,z1,z1
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
    // vTemp1 = x1,y1,z1,x2 (Final)
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
    // vTemp3 = z3,z3,x4,x4
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
    // vTemp3 = z3,x4,y4,z4 (Final)
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
    _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1803
1804//------------------------------------------------------------------------------
_Use_decl_annotations_
// Stores the upper 4x3 portion of matrix M (the w element of each row is
// dropped) into a 16-byte-aligned XMFLOAT4X3A. The 12 floats are packed
// contiguously, so the SIMD paths can emit exactly three aligned 16-byte
// stores instead of twelve scalar writes.
inline void XMStoreFloat4x3A
(
    XMFLOAT4X3A* pDestination,
    CXMMATRIX M
)
{
    assert(pDestination);
    // The A-suffixed type guarantees 16-byte alignment; verify it so the
    // aligned stores below are legal.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];

    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // y1,z1,w1,x2
    __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 );
    // x1,y1,z1,x2 (Final) -- g_XMMask3 selects row 0's xyz, T1's last lane
    __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
    vst1q_f32_ex( &pDestination->m[0][0], T2, 128 );

    // y2,z2,w2,x2
    T1 = vextq_f32( M.r[1], M.r[1], 1 );
    // y2,z2,x3,y3 (Final)
    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
    vst1q_f32_ex( &pDestination->m[1][1], T2, 128 );

    // z3,z3,z3,z3
    T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
    // z3,x4,y4,z4 (Final)
    T2 = vextq_f32( T1, M.r[3], 3 );
    vst1q_f32_ex( &pDestination->m[2][2], T2, 128 );
#elif defined(_XM_SSE_INTRINSICS_)
    // x1,y1,z1,w1
    XMVECTOR vTemp1 = M.r[0];
    // x2,y2,z2,w2
    XMVECTOR vTemp2 = M.r[1];
    // x3,y3,z3,w3
    XMVECTOR vTemp3 = M.r[2];
    // x4,y4,z4,w4
    XMVECTOR vTemp4 = M.r[3];
    // z1,z1,x2,y2
    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
    // y2,z2,x3,y3 (Final)
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    // x1,y1,z1,x2 (Final)
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
    // z3,z3,x4,x4
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
    // z3,x4,y4,z4 (Final)
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
    // Store in 3 operations; the stores at m[1][1] and m[2][2] are offsets
    // of 16 and 32 bytes from the aligned base, so they remain 16-byte aligned.
    _mm_store_ps(&pDestination->m[0][0],vTemp1);
    _mm_store_ps(&pDestination->m[1][1],vTemp2);
    _mm_store_ps(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1870
1871
1872//------------------------------------------------------------------------------
1873_Use_decl_annotations_
1874inline void XMStoreFloat4x4
1875(
1876 XMFLOAT4X4* pDestination,
1877 CXMMATRIX M
1878)
1879{
1880 assert(pDestination);
1881#if defined(_XM_NO_INTRINSICS_)
1882
1883 pDestination->m[0][0] = M.r[0].vector4_f32[0];
1884 pDestination->m[0][1] = M.r[0].vector4_f32[1];
1885 pDestination->m[0][2] = M.r[0].vector4_f32[2];
1886 pDestination->m[0][3] = M.r[0].vector4_f32[3];
1887
1888 pDestination->m[1][0] = M.r[1].vector4_f32[0];
1889 pDestination->m[1][1] = M.r[1].vector4_f32[1];
1890 pDestination->m[1][2] = M.r[1].vector4_f32[2];
1891 pDestination->m[1][3] = M.r[1].vector4_f32[3];
1892
1893 pDestination->m[2][0] = M.r[2].vector4_f32[0];
1894 pDestination->m[2][1] = M.r[2].vector4_f32[1];
1895 pDestination->m[2][2] = M.r[2].vector4_f32[2];
1896 pDestination->m[2][3] = M.r[2].vector4_f32[3];
1897
1898 pDestination->m[3][0] = M.r[3].vector4_f32[0];
1899 pDestination->m[3][1] = M.r[3].vector4_f32[1];
1900 pDestination->m[3][2] = M.r[3].vector4_f32[2];
1901 pDestination->m[3][3] = M.r[3].vector4_f32[3];
1902
1903#elif defined(_XM_ARM_NEON_INTRINSICS_)
1904 vst1q_f32( reinterpret_cast<float*>(&pDestination->_11), M.r[0] );
1905 vst1q_f32( reinterpret_cast<float*>(&pDestination->_21), M.r[1] );
1906 vst1q_f32( reinterpret_cast<float*>(&pDestination->_31), M.r[2] );
1907 vst1q_f32( reinterpret_cast<float*>(&pDestination->_41), M.r[3] );
1908#elif defined(_XM_SSE_INTRINSICS_)
1909 _mm_storeu_ps( &pDestination->_11, M.r[0] );
1910 _mm_storeu_ps( &pDestination->_21, M.r[1] );
1911 _mm_storeu_ps( &pDestination->_31, M.r[2] );
1912 _mm_storeu_ps( &pDestination->_41, M.r[3] );
1913#else // _XM_VMX128_INTRINSICS_
1914#endif // _XM_VMX128_INTRINSICS_
1915}
1916
1917//------------------------------------------------------------------------------
1918_Use_decl_annotations_
1919inline void XMStoreFloat4x4A
1920(
1921 XMFLOAT4X4A* pDestination,
1922 CXMMATRIX M
1923)
1924{
1925 assert(pDestination);
1926 assert(((uintptr_t)pDestination & 0xF) == 0);
1927#if defined(_XM_NO_INTRINSICS_)
1928
1929 pDestination->m[0][0] = M.r[0].vector4_f32[0];
1930 pDestination->m[0][1] = M.r[0].vector4_f32[1];
1931 pDestination->m[0][2] = M.r[0].vector4_f32[2];
1932 pDestination->m[0][3] = M.r[0].vector4_f32[3];
1933
1934 pDestination->m[1][0] = M.r[1].vector4_f32[0];
1935 pDestination->m[1][1] = M.r[1].vector4_f32[1];
1936 pDestination->m[1][2] = M.r[1].vector4_f32[2];
1937 pDestination->m[1][3] = M.r[1].vector4_f32[3];
1938
1939 pDestination->m[2][0] = M.r[2].vector4_f32[0];
1940 pDestination->m[2][1] = M.r[2].vector4_f32[1];
1941 pDestination->m[2][2] = M.r[2].vector4_f32[2];
1942 pDestination->m[2][3] = M.r[2].vector4_f32[3];
1943
1944 pDestination->m[3][0] = M.r[3].vector4_f32[0];
1945 pDestination->m[3][1] = M.r[3].vector4_f32[1];
1946 pDestination->m[3][2] = M.r[3].vector4_f32[2];
1947 pDestination->m[3][3] = M.r[3].vector4_f32[3];
1948
1949#elif defined(_XM_ARM_NEON_INTRINSICS_)
1950 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128 );
1951 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128 );
1952 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128 );
1953 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128 );
1954#elif defined(_XM_SSE_INTRINSICS_)
1955 _mm_store_ps( &pDestination->_11, M.r[0] );
1956 _mm_store_ps( &pDestination->_21, M.r[1] );
1957 _mm_store_ps( &pDestination->_31, M.r[2] );
1958 _mm_store_ps( &pDestination->_41, M.r[3] );
1959#else // _XM_VMX128_INTRINSICS_
1960#endif // _XM_VMX128_INTRINSICS_
1961}
1962