A Minecraft-style game — go into mines and start crafting — but for consoles (forked directly from smartcmd's GitHub repository).
Branch: master — 3545 lines, 111 kB (view raw)
1//------------------------------------------------------------------------------------- 2// DirectXPackedVector.inl -- SIMD C++ Math library 3// 4// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF 5// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO 6// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A 7// PARTICULAR PURPOSE. 8// 9// Copyright (c) Microsoft Corporation. All rights reserved. 10//------------------------------------------------------------------------------------- 11 12#ifdef _MSC_VER 13#pragma once 14#endif 15 16 17/**************************************************************************** 18 * 19 * Data conversion 20 * 21 ****************************************************************************/ 22 23//------------------------------------------------------------------------------ 24 25inline float PackedVector::XMConvertHalfToFloat 26( 27 HALF Value 28) 29{ 30#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 31 32 uint32_t Mantissa = (uint32_t)(Value & 0x03FF); 33 34 uint32_t Exponent; 35 if ((Value & 0x7C00) != 0) // The value is normalized 36 { 37 Exponent = (uint32_t)((Value >> 10) & 0x1F); 38 } 39 else if (Mantissa != 0) // The value is denormalized 40 { 41 // Normalize the value in the resulting float 42 Exponent = 1; 43 44 do 45 { 46 Exponent--; 47 Mantissa <<= 1; 48 } while ((Mantissa & 0x0400) == 0); 49 50 Mantissa &= 0x03FF; 51 } 52 else // The value is zero 53 { 54 Exponent = (uint32_t)-112; 55 } 56 57 uint32_t Result = ((Value & 0x8000) << 16) | // Sign 58 ((Exponent + 112) << 23) | // Exponent 59 (Mantissa << 13); // Mantissa 60 61 return reinterpret_cast<float*>(&Result)[0]; 62#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 63#endif 64} 65 66//------------------------------------------------------------------------------ 67_Use_decl_annotations_ 68inline float* PackedVector::XMConvertHalfToFloatStream 69( 70 float* pOutputStream, 71 
size_t OutputStride, 72 const HALF* pInputStream, 73 size_t InputStride, 74 size_t HalfCount 75) 76{ 77 assert(pOutputStream); 78 assert(pInputStream); 79#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 80 81 const uint8_t* pHalf = reinterpret_cast<const uint8_t*>(pInputStream); 82 uint8_t* pFloat = reinterpret_cast<uint8_t*>(pOutputStream); 83 84 for (size_t i = 0; i < HalfCount; i++) 85 { 86 *reinterpret_cast<float*>(pFloat) = XMConvertHalfToFloat(reinterpret_cast<const HALF*>(pHalf)[0]); 87 pHalf += InputStride; 88 pFloat += OutputStride; 89 } 90 91 return pOutputStream; 92 93#else // _XM_VMX128_INTRINSICS_ 94#endif // _XM_VMX128_INTRINSICS_ 95} 96 97//------------------------------------------------------------------------------ 98 99inline PackedVector::HALF PackedVector::XMConvertFloatToHalf 100( 101 float Value 102) 103{ 104#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 105 uint32_t Result; 106 107 uint32_t IValue = reinterpret_cast<uint32_t *>(&Value)[0]; 108 uint32_t Sign = (IValue & 0x80000000U) >> 16U; 109 IValue = IValue & 0x7FFFFFFFU; // Hack off the sign 110 111 if (IValue > 0x47FFEFFFU) 112 { 113 // The number is too large to be represented as a half. Saturate to infinity. 114 Result = 0x7FFFU; 115 } 116 else 117 { 118 if (IValue < 0x38800000U) 119 { 120 // The number is too small to be represented as a normalized half. 121 // Convert it to a denormalized value. 122 uint32_t Shift = 113U - (IValue >> 23U); 123 IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift; 124 } 125 else 126 { 127 // Rebias the exponent to represent the value as a normalized half. 
128 IValue += 0xC8000000U; 129 } 130 131 Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU; 132 } 133 return (HALF)(Result|Sign); 134#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 135#endif 136} 137 138//------------------------------------------------------------------------------ 139_Use_decl_annotations_ 140inline PackedVector::HALF* PackedVector::XMConvertFloatToHalfStream 141( 142 HALF* pOutputStream, 143 size_t OutputStride, 144 const float* pInputStream, 145 size_t InputStride, 146 size_t FloatCount 147) 148{ 149 assert(pOutputStream); 150 assert(pInputStream); 151#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 152 153 const uint8_t* pFloat = reinterpret_cast<const uint8_t*>(pInputStream); 154 uint8_t* pHalf = reinterpret_cast<uint8_t*>(pOutputStream); 155 156 for (size_t i = 0; i < FloatCount; i++) 157 { 158 *reinterpret_cast<HALF*>(pHalf) = XMConvertFloatToHalf(reinterpret_cast<const float*>(pFloat)[0]); 159 pFloat += InputStride; 160 pHalf += OutputStride; 161 } 162 return pOutputStream; 163 164#else // _XM_VMX128_INTRINSICS_ 165#endif // _XM_VMX128_INTRINSICS_ 166} 167 168/**************************************************************************** 169 * 170 * Vector and matrix load operations 171 * 172 ****************************************************************************/ 173_Use_decl_annotations_ 174inline XMVECTOR PackedVector::XMLoadColor 175( 176 const XMCOLOR* pSource 177) 178{ 179 assert(pSource); 180#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 181 // int32_t -> Float conversions are done in one instruction. 182 // uint32_t -> Float calls a runtime function. 
Keep in int32_t 183 int32_t iColor = (int32_t)(pSource->c); 184 XMVECTORF32 vColor = { 185 (float)((iColor >> 16) & 0xFF) * (1.0f/255.0f), 186 (float)((iColor >> 8) & 0xFF) * (1.0f/255.0f), 187 (float)(iColor & 0xFF) * (1.0f/255.0f), 188 (float)((iColor >> 24) & 0xFF) * (1.0f/255.0f) 189 }; 190 return vColor.v; 191#elif defined(_XM_SSE_INTRINSICS_) 192 // Splat the color in all four entries 193 __m128i vInt = _mm_set1_epi32(pSource->c); 194 // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 195 vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8); 196 // a is unsigned! Flip the bit to convert the order to signed 197 vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8); 198 // Convert to floating point numbers 199 XMVECTOR vTemp = _mm_cvtepi32_ps(vInt); 200 // RGB + 0, A + 0x80000000.f to undo the signed order. 201 vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8); 202 // Convert 0-255 to 0.0f-1.0f 203 return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8); 204#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 205#endif // _XM_VMX128_INTRINSICS_ 206} 207 208//------------------------------------------------------------------------------ 209_Use_decl_annotations_ 210inline XMVECTOR PackedVector::XMLoadHalf2 211( 212 const XMHALF2* pSource 213) 214{ 215 assert(pSource); 216#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 217 XMVECTORF32 vResult = { 218 XMConvertHalfToFloat(pSource->x), 219 XMConvertHalfToFloat(pSource->y), 220 0.0f, 221 0.0f 222 }; 223 return vResult.v; 224#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 225#endif // _XM_VMX128_INTRINSICS_ 226} 227 228//------------------------------------------------------------------------------ 229_Use_decl_annotations_ 230inline XMVECTOR PackedVector::XMLoadShortN2 231( 232 const XMSHORTN2* pSource 233) 234{ 235 assert(pSource); 236#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 237 XMVECTORF32 vResult = { 238 (pSource->x == -32768) ? 
-1.f : ((float)pSource->x * (1.0f/32767.0f)), 239 (pSource->y == -32768) ? -1.f : ((float)pSource->y * (1.0f/32767.0f)), 240 0.0f, 241 0.0f 242 }; 243 return vResult.v; 244#elif defined(_XM_SSE_INTRINSICS_) 245 // Splat the two shorts in all four entries (WORD alignment okay, 246 // DWORD alignment preferred) 247 __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x)); 248 // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 249 vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); 250 // x needs to be sign extended 251 vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); 252 // Convert to floating point numbers 253 vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); 254 // x - 0x8000 to undo the signed order. 255 vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); 256 // Convert -1.0f - 1.0f 257 vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16); 258 // Clamp result (for case of -32768) 259 return _mm_max_ps( vTemp, g_XMNegativeOne ); 260#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 261#endif // _XM_VMX128_INTRINSICS_ 262} 263 264//------------------------------------------------------------------------------ 265_Use_decl_annotations_ 266inline XMVECTOR PackedVector::XMLoadShort2 267( 268 const XMSHORT2* pSource 269) 270{ 271 assert(pSource); 272#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 273 XMVECTORF32 vResult = { 274 (float)pSource->x, 275 (float)pSource->y, 276 0.f, 277 0.f 278 }; 279 return vResult.v; 280#elif defined(_XM_SSE_INTRINSICS_) 281 // Splat the two shorts in all four entries (WORD alignment okay, 282 // DWORD alignment preferred) 283 __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x)); 284 // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 285 vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); 286 // x needs to be sign extended 287 vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16); 288 // Convert to floating point numbers 289 vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); 290 // x - 0x8000 to undo the signed order. 
291 vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16); 292 // Y is 65536 too large 293 return _mm_mul_ps(vTemp,g_XMFixupY16); 294#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 295#endif // _XM_VMX128_INTRINSICS_ 296} 297 298//------------------------------------------------------------------------------ 299_Use_decl_annotations_ 300inline XMVECTOR PackedVector::XMLoadUShortN2 301( 302 const XMUSHORTN2* pSource 303) 304{ 305 assert(pSource); 306#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 307 XMVECTORF32 vResult = { 308 (float)pSource->x / 65535.0f, 309 (float)pSource->y / 65535.0f, 310 0.f, 311 0.f 312 }; 313 return vResult.v; 314#elif defined(_XM_SSE_INTRINSICS_) 315 static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f}; 316 static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0}; 317 // Splat the two shorts in all four entries (WORD alignment okay, 318 // DWORD alignment preferred) 319 __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x)); 320 // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 321 vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); 322 // y needs to be sign flipped 323 vTemp = _mm_xor_ps(vTemp,g_XMFlipY); 324 // Convert to floating point numbers 325 vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); 326 // y + 0x8000 to undo the signed order. 
327 vTemp = _mm_add_ps(vTemp,FixaddY16); 328 // Y is 65536 times too large 329 vTemp = _mm_mul_ps(vTemp,FixupY16); 330 return vTemp; 331#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 332#endif // _XM_VMX128_INTRINSICS_ 333} 334 335//------------------------------------------------------------------------------ 336_Use_decl_annotations_ 337inline XMVECTOR PackedVector::XMLoadUShort2 338( 339 const XMUSHORT2* pSource 340) 341{ 342 assert(pSource); 343#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 344 XMVECTORF32 vResult = { 345 (float)pSource->x, 346 (float)pSource->y, 347 0.f, 348 0.f 349 }; 350 return vResult.v; 351#elif defined(_XM_SSE_INTRINSICS_) 352 static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0}; 353 // Splat the two shorts in all four entries (WORD alignment okay, 354 // DWORD alignment preferred) 355 __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x)); 356 // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0 357 vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16); 358 // y needs to be sign flipped 359 vTemp = _mm_xor_ps(vTemp,g_XMFlipY); 360 // Convert to floating point numbers 361 vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); 362 // Y is 65536 times too large 363 vTemp = _mm_mul_ps(vTemp,g_XMFixupY16); 364 // y + 0x8000 to undo the signed order. 365 vTemp = _mm_add_ps(vTemp,FixaddY16); 366 return vTemp; 367#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 368#endif // _XM_VMX128_INTRINSICS_ 369} 370 371//------------------------------------------------------------------------------ 372_Use_decl_annotations_ 373inline XMVECTOR PackedVector::XMLoadByteN2 374( 375 const XMBYTEN2* pSource 376) 377{ 378 assert(pSource); 379 XMVECTORF32 vResult = { 380 (pSource->x == -128) ? -1.f : ((float)pSource->x * (1.0f/127.0f)), 381 (pSource->y == -128) ? 
-1.f : ((float)pSource->y * (1.0f/127.0f)), 382 0.0f, 383 0.0f 384 }; 385 return vResult.v; 386} 387 388//------------------------------------------------------------------------------ 389_Use_decl_annotations_ 390inline XMVECTOR PackedVector::XMLoadByte2 391( 392 const XMBYTE2* pSource 393) 394{ 395 assert(pSource); 396 XMVECTORF32 vResult = { 397 (float)pSource->x, 398 (float)pSource->y, 399 0.0f, 400 0.0f 401 }; 402 return vResult.v; 403} 404 405//------------------------------------------------------------------------------ 406_Use_decl_annotations_ 407inline XMVECTOR PackedVector::XMLoadUByteN2 408( 409 const XMUBYTEN2* pSource 410) 411{ 412 assert(pSource); 413 XMVECTORF32 vResult = { 414 (float)pSource->x * (1.0f/255.0f), 415 (float)pSource->y * (1.0f/255.0f), 416 0.0f, 417 0.0f 418 }; 419 return vResult.v; 420} 421 422//------------------------------------------------------------------------------ 423_Use_decl_annotations_ 424inline XMVECTOR PackedVector::XMLoadUByte2 425( 426 const XMUBYTE2* pSource 427) 428{ 429 assert(pSource); 430 XMVECTORF32 vResult = { 431 (float)pSource->x, 432 (float)pSource->y, 433 0.0f, 434 0.0f 435 }; 436 return vResult.v; 437} 438 439//------------------------------------------------------------------------------ 440_Use_decl_annotations_ 441inline XMVECTOR PackedVector::XMLoadU565 442( 443 const XMU565* pSource 444) 445{ 446 assert(pSource); 447#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 448 static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0}; 449 static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0}; 450 // Get the 32 bit value and splat it 451 XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); 452 // Mask off x, y and z 453 vResult = _mm_and_ps(vResult,U565And); 454 // Convert to float 455 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); 456 // Normalize x, y, and z 457 vResult = _mm_mul_ps(vResult,U565Mul); 458 return vResult; 459#else 460 
XMVECTORF32 vResult = { 461 float(pSource->v & 0x1F), 462 float((pSource->v >> 5) & 0x3F), 463 float((pSource->v >> 11) & 0x1F), 464 0.f, 465 }; 466 return vResult.v; 467#endif // !_XM_SSE_INTRINSICS_ 468} 469 470//------------------------------------------------------------------------------ 471_Use_decl_annotations_ 472inline XMVECTOR PackedVector::XMLoadFloat3PK 473( 474 const XMFLOAT3PK* pSource 475) 476{ 477 assert(pSource); 478 479 __declspec(align(16)) uint32_t Result[4]; 480 uint32_t Mantissa; 481 uint32_t Exponent; 482 483 // X Channel (6-bit mantissa) 484 Mantissa = pSource->xm; 485 486 if ( pSource->xe == 0x1f ) // INF or NAN 487 { 488 Result[0] = 0x7f800000 | (pSource->xm << 17); 489 } 490 else 491 { 492 if ( pSource->xe != 0 ) // The value is normalized 493 { 494 Exponent = pSource->xe; 495 } 496 else if (Mantissa != 0) // The value is denormalized 497 { 498 // Normalize the value in the resulting float 499 Exponent = 1; 500 501 do 502 { 503 Exponent--; 504 Mantissa <<= 1; 505 } while ((Mantissa & 0x40) == 0); 506 507 Mantissa &= 0x3F; 508 } 509 else // The value is zero 510 { 511 Exponent = (uint32_t)-112; 512 } 513 514 Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17); 515 } 516 517 // Y Channel (6-bit mantissa) 518 Mantissa = pSource->ym; 519 520 if ( pSource->ye == 0x1f ) // INF or NAN 521 { 522 Result[1] = 0x7f800000 | (pSource->ym << 17); 523 } 524 else 525 { 526 if ( pSource->ye != 0 ) // The value is normalized 527 { 528 Exponent = pSource->ye; 529 } 530 else if (Mantissa != 0) // The value is denormalized 531 { 532 // Normalize the value in the resulting float 533 Exponent = 1; 534 535 do 536 { 537 Exponent--; 538 Mantissa <<= 1; 539 } while ((Mantissa & 0x40) == 0); 540 541 Mantissa &= 0x3F; 542 } 543 else // The value is zero 544 { 545 Exponent = (uint32_t)-112; 546 } 547 548 Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17); 549 } 550 551 // Z Channel (5-bit mantissa) 552 Mantissa = pSource->zm; 553 554 if ( pSource->ze == 0x1f 
) // INF or NAN 555 { 556 Result[2] = 0x7f800000 | (pSource->zm << 17); 557 } 558 else 559 { 560 if ( pSource->ze != 0 ) // The value is normalized 561 { 562 Exponent = pSource->ze; 563 } 564 else if (Mantissa != 0) // The value is denormalized 565 { 566 // Normalize the value in the resulting float 567 Exponent = 1; 568 569 do 570 { 571 Exponent--; 572 Mantissa <<= 1; 573 } while ((Mantissa & 0x20) == 0); 574 575 Mantissa &= 0x1F; 576 } 577 else // The value is zero 578 { 579 Exponent = (uint32_t)-112; 580 } 581 582 Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18); 583 } 584 585 return XMLoadFloat3A( reinterpret_cast<const XMFLOAT3A*>(&Result) ); 586} 587 588//------------------------------------------------------------------------------ 589_Use_decl_annotations_ 590inline XMVECTOR PackedVector::XMLoadFloat3SE 591( 592 const XMFLOAT3SE* pSource 593) 594{ 595 assert(pSource); 596 597 __declspec(align(16)) uint32_t Result[4]; 598 uint32_t Mantissa; 599 uint32_t Exponent, ExpBits; 600 601 if ( pSource->e == 0x1f ) // INF or NAN 602 { 603 Result[0] = 0x7f800000 | (pSource->xm << 14); 604 Result[1] = 0x7f800000 | (pSource->ym << 14); 605 Result[2] = 0x7f800000 | (pSource->zm << 14); 606 } 607 else if ( pSource->e != 0 ) // The values are all normalized 608 { 609 Exponent = pSource->e; 610 611 ExpBits = (Exponent + 112) << 23; 612 613 Mantissa = pSource->xm; 614 Result[0] = ExpBits | (Mantissa << 14); 615 616 Mantissa = pSource->ym; 617 Result[1] = ExpBits | (Mantissa << 14); 618 619 Mantissa = pSource->zm; 620 Result[2] = ExpBits | (Mantissa << 14); 621 } 622 else 623 { 624 // X Channel 625 Mantissa = pSource->xm; 626 627 if (Mantissa != 0) // The value is denormalized 628 { 629 // Normalize the value in the resulting float 630 Exponent = 1; 631 632 do 633 { 634 Exponent--; 635 Mantissa <<= 1; 636 } while ((Mantissa & 0x200) == 0); 637 638 Mantissa &= 0x1FF; 639 } 640 else // The value is zero 641 { 642 Exponent = (uint32_t)-112; 643 } 644 645 Result[0] = 
((Exponent + 112) << 23) | (Mantissa << 14); 646 647 // Y Channel 648 Mantissa = pSource->ym; 649 650 if (Mantissa != 0) // The value is denormalized 651 { 652 // Normalize the value in the resulting float 653 Exponent = 1; 654 655 do 656 { 657 Exponent--; 658 Mantissa <<= 1; 659 } while ((Mantissa & 0x200) == 0); 660 661 Mantissa &= 0x1FF; 662 } 663 else // The value is zero 664 { 665 Exponent = (uint32_t)-112; 666 } 667 668 Result[1] = ((Exponent + 112) << 23) | (Mantissa << 14); 669 670 // Z Channel 671 Mantissa = pSource->zm; 672 673 if (Mantissa != 0) // The value is denormalized 674 { 675 // Normalize the value in the resulting float 676 Exponent = 1; 677 678 do 679 { 680 Exponent--; 681 Mantissa <<= 1; 682 } while ((Mantissa & 0x200) == 0); 683 684 Mantissa &= 0x1FF; 685 } 686 else // The value is zero 687 { 688 Exponent = (uint32_t)-112; 689 } 690 691 Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14); 692 } 693 694 return XMLoadFloat3A( reinterpret_cast<const XMFLOAT3A*>(&Result) ); 695} 696 697//------------------------------------------------------------------------------ 698_Use_decl_annotations_ 699inline XMVECTOR PackedVector::XMLoadHalf4 700( 701 const XMHALF4* pSource 702) 703{ 704 assert(pSource); 705#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 706 XMVECTORF32 vResult = { 707 XMConvertHalfToFloat(pSource->x), 708 XMConvertHalfToFloat(pSource->y), 709 XMConvertHalfToFloat(pSource->z), 710 XMConvertHalfToFloat(pSource->w) 711 }; 712 return vResult.v; 713#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 714#endif // _XM_VMX128_INTRINSICS_ 715} 716 717//------------------------------------------------------------------------------ 718_Use_decl_annotations_ 719inline XMVECTOR PackedVector::XMLoadShortN4 720( 721 const XMSHORTN4* pSource 722) 723{ 724 assert(pSource); 725#if defined(_XM_NO_INTRINSICS_) 726 XMVECTORF32 vResult = { 727 (pSource->x == -32768) ? 
-1.f : ((float)pSource->x * (1.0f/32767.0f)), 728 (pSource->y == -32768) ? -1.f : ((float)pSource->y * (1.0f/32767.0f)), 729 (pSource->z == -32768) ? -1.f : ((float)pSource->z * (1.0f/32767.0f)), 730 (pSource->w == -32768) ? -1.f : ((float)pSource->w * (1.0f/32767.0f)) 731 }; 732 return vResult.v; 733#elif defined(_XM_ARM_NEON_INTRINSICS_) 734 __n64 vInt = vld1_s16( (const int16_t*)pSource ); 735 __n128 V = vmovl_s16( vInt ); 736 V = vcvtq_f32_s32( V ); 737 const __n128 Scale = vdupq_n_f32( 1.0f/32767.0f ); 738 V = vmulq_f32( V, Scale ); 739 return vmaxq_f32( V, g_XMNegativeOne ); 740#elif defined(_XM_SSE_INTRINSICS_) 741 // Splat the color in all four entries (x,z,y,w) 742 __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x)); 743 // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 744 __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); 745 // x and z are unsigned! Flip the bits to convert the order to signed 746 vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); 747 // Convert to floating point numbers 748 vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); 749 // x and z - 0x8000 to complete the conversion 750 vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); 751 // Convert to -1.0f - 1.0f 752 vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16); 753 // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w 754 vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); 755 // Clamp result (for case of -32768) 756 return _mm_max_ps( vTemp, g_XMNegativeOne ); 757#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 758#endif // _XM_VMX128_INTRINSICS_ 759} 760 761//------------------------------------------------------------------------------ 762_Use_decl_annotations_ 763inline XMVECTOR PackedVector::XMLoadShort4 764( 765 const XMSHORT4* pSource 766) 767{ 768 assert(pSource); 769#if defined(_XM_NO_INTRINSICS_) 770 XMVECTORF32 vResult = { 771 (float)pSource->x, 772 (float)pSource->y, 773 (float)pSource->z, 774 (float)pSource->w 775 }; 776 return vResult.v; 777#elif defined(_XM_ARM_NEON_INTRINSICS_) 778 __n64 vInt = vld1_s16( (const int16_t*)pSource ); 779 __n128 V = vmovl_s16( vInt ); 780 return vcvtq_f32_s32( V ); 781#elif defined(_XM_SSE_INTRINSICS_) 782 // Splat the color in all four entries (x,z,y,w) 783 __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x)); 784 // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 785 __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); 786 // x and z are unsigned! Flip the bits to convert the order to signed 787 vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16); 788 // Convert to floating point numbers 789 vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); 790 // x and z - 0x8000 to complete the conversion 791 vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16); 792 // Fix y and w because they are 65536 too large 793 vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); 794 // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w 795 return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); 796#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 797#endif // _XM_VMX128_INTRINSICS_ 798} 799 800//------------------------------------------------------------------------------ 801_Use_decl_annotations_ 802inline XMVECTOR PackedVector::XMLoadUShortN4 803( 804 const XMUSHORTN4* pSource 805) 806{ 807 assert(pSource); 808#if defined(_XM_NO_INTRINSICS_) 809 XMVECTORF32 vResult = { 810 (float)pSource->x / 65535.0f, 811 (float)pSource->y / 65535.0f, 812 (float)pSource->z / 65535.0f, 813 (float)pSource->w / 65535.0f 814 }; 815 return vResult.v; 816#elif defined(_XM_ARM_NEON_INTRINSICS_) 817 __n64 vInt = vld1_u16( (const uint16_t*)pSource ); 818 __n128 V = vmovl_u16( vInt ); 819 V = vcvtq_f32_u32( V ); 820 const __n128 Scale = vdupq_n_f32( 1.0f/65535.0f ); 821 return vmulq_f32( V, Scale ); 822#elif defined(_XM_SSE_INTRINSICS_) 823 static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)}; 824 static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f*65536.0f,32768.0f*65536.0f}; 825 // Splat the color in all four entries (x,z,y,w) 826 __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x)); 827 // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 828 __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); 829 // y and w are signed! Flip the bits to convert the order to unsigned 830 vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); 831 // Convert to floating point numbers 832 vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); 833 // y and w + 0x8000 to complete the conversion 834 vTemp = _mm_add_ps(vTemp,FixaddY16W16); 835 // Fix y and w because they are 65536 too large 836 vTemp = _mm_mul_ps(vTemp,FixupY16W16); 837 // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w 838 return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); 839#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 840#endif // _XM_VMX128_INTRINSICS_ 841} 842 843//------------------------------------------------------------------------------ 844_Use_decl_annotations_ 845inline XMVECTOR PackedVector::XMLoadUShort4 846( 847 const XMUSHORT4* pSource 848) 849{ 850 assert(pSource); 851#if defined(_XM_NO_INTRINSICS_) 852 XMVECTORF32 vResult = { 853 (float)pSource->x, 854 (float)pSource->y, 855 (float)pSource->z, 856 (float)pSource->w 857 }; 858 return vResult.v; 859#elif defined(_XM_ARM_NEON_INTRINSICS_) 860 __n64 vInt = vld1_u16( (const uint16_t*)pSource ); 861 __n128 V = vmovl_u16( vInt ); 862 return vcvtq_f32_u32( V ); 863#elif defined(_XM_SSE_INTRINSICS_) 864 static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f,32768.0f}; 865 // Splat the color in all four entries (x,z,y,w) 866 __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x)); 867 // Shift x&0ffff,z&0xffff,y&0xffff0000,w&0xffff0000 868 __m128 vTemp = _mm_and_ps(_mm_castpd_ps(vIntd),g_XMMaskX16Y16Z16W16); 869 // y and w are signed! Flip the bits to convert the order to unsigned 870 vTemp = _mm_xor_ps(vTemp,g_XMFlipZW); 871 // Convert to floating point numbers 872 vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); 873 // Fix y and w because they are 65536 too large 874 vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16); 875 // y and w + 0x8000 to complete the conversion 876 vTemp = _mm_add_ps(vTemp,FixaddY16W16); 877 // Very important! 
The entries are x,z,y,w, flip it to x,y,z,w 878 return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(3,1,2,0)); 879#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 880#endif // _XM_VMX128_INTRINSICS_ 881} 882 883//------------------------------------------------------------------------------ 884_Use_decl_annotations_ 885inline XMVECTOR PackedVector::XMLoadXDecN4 886( 887 const XMXDECN4* pSource 888) 889{ 890 assert(pSource); 891#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 892 static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; 893 894 uint32_t ElementX = pSource->v & 0x3FF; 895 uint32_t ElementY = (pSource->v >> 10) & 0x3FF; 896 uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; 897 898 XMVECTORF32 vResult = { 899 (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f), 900 (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f), 901 (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f), 902 (float)(pSource->v >> 30) / 3.0f 903 }; 904 return vResult.v; 905#elif defined(_XM_SSE_INTRINSICS_) 906 // Splat the color in all four entries 907 __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); 908 // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 909 vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10); 910 // a is unsigned! Flip the bit to convert the order to signed 911 vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10); 912 // Convert to floating point numbers 913 vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); 914 // RGB + 0, A + 0x80000000.f to undo the signed order. 
915 vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10); 916 // Convert 0-255 to 0.0f-1.0f 917 vTemp = _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10); 918 // Clamp result (for case of -512) 919 return _mm_max_ps( vTemp, g_XMNegativeOne ); 920#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS) 921#endif // _XM_VMX128_INTRINSICS_ 922} 923 924//------------------------------------------------------------------------------ 925_Use_decl_annotations_ 926inline XMVECTOR PackedVector::XMLoadXDec4 927( 928 const XMXDEC4* pSource 929) 930{ 931 assert(pSource); 932#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 933 static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00}; 934 935 uint32_t ElementX = pSource->v & 0x3FF; 936 uint32_t ElementY = (pSource->v >> 10) & 0x3FF; 937 uint32_t ElementZ = (pSource->v >> 20) & 0x3FF; 938 939 XMVECTORF32 vResult = { 940 (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]), 941 (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]), 942 (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]), 943 (float)(pSource->v >> 30) 944 }; 945 return vResult.v; 946#elif defined(_XM_SSE_INTRINSICS_) 947 static const XMVECTORI32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000}; 948 static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f}; 949 // Splat the color in all four entries 950 XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); 951 // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000 952 vTemp = _mm_and_ps(vTemp,g_XMMaskDec4); 953 // a is unsigned! Flip the bit to convert the order to signed 954 vTemp = _mm_xor_ps(vTemp,XDec4Xor); 955 // Convert to floating point numbers 956 vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp)); 957 // RGB + 0, A + 0x80000000.f to undo the signed order. 
    // (tail of the preceding Dec4 loader)
    vTemp = _mm_add_ps(vTemp,XDec4Add);
    // Undo the field-position scaling of y, z and w
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Loads an XMUDECN4 (normalized unsigned 10:10:10:2) into an XMVECTOR:
// x/y/z map to [0,1] via /1023, w maps to [0,1] via /3.
_Use_decl_annotations_
inline XMVECTOR PackedVector::XMLoadUDecN4
(
    const XMUDECN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    uint32_t ElementX = pSource->v & 0x3FF;
    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;

    XMVECTORF32 vResult = {
        (float)ElementX / 1023.0f,
        (float)ElementY / 1023.0f,
        (float)ElementZ / 1023.0f,
        (float)(pSource->v >> 30) / 3.0f
    };
    return vResult.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // Per-component reciprocal scale; y/z/w also undo their bit-position shifts.
    static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
    // Splat the 32-bit value in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask the 10:10:10:2 fields in place
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // w occupies the sign bits; flip so the signed int conversion is correct
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // x,y,z + 0, w + 0x80000000.f to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Normalize each field into its 0.0f-1.0f range
    vTemp = _mm_mul_ps(vTemp,UDecN4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Loads an XMUDEC4 (unsigned 10:10:10:2, unnormalized integer values).
_Use_decl_annotations_
inline XMVECTOR PackedVector::XMLoadUDec4
(
    const XMUDEC4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    uint32_t ElementX = pSource->v & 0x3FF;
    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;

    XMVECTORF32 vResult = {
        (float)ElementX,
        (float)ElementY,
        (float)ElementZ,
        (float)(pSource->v >> 30)
    };
    return vResult.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat the 32-bit value in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask the 10:10:10:2 fields in place
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // w occupies the sign bits; flip so the signed int conversion is correct
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // x,y,z + 0, w + 0x80000000.f to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Undo the field-position scaling (y/2^10, z/2^20, w/2^30)
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Loads an XMDECN4 (normalized signed 10:10:10:2); -512 and 2-bit -2 clamp to -1.
_Use_decl_annotations_
inline XMVECTOR PackedVector::XMLoadDecN4
(
    const XMDECN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    // Lookup tables to sign-extend the 10-bit and 2-bit fields
    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
    static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC};

    uint32_t ElementX = pSource->v & 0x3FF;
    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
    uint32_t ElementW = pSource->v >> 30;

    XMVECTORF32 vResult = {
        (ElementX == 0x200) ? -1.f : ((float)(int16_t)(ElementX | SignExtend[ElementX >> 9]) / 511.0f),
        (ElementY == 0x200) ? -1.f : ((float)(int16_t)(ElementY | SignExtend[ElementY >> 9]) / 511.0f),
        (ElementZ == 0x200) ? -1.f : ((float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]) / 511.0f),
        (ElementW == 0x2) ? -1.f : ((float)(int16_t)(ElementW | SignExtendW[(ElementW >> 1) & 1]))
    };
    return vResult.v;
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
    // Splat the 32-bit value in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask the 10:10:10:2 fields in place
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // Flip the sign bit of each field so the values order as signed
    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // Undo the sign-bit flip in float space
    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
    // Normalize each field into its -1.0f..1.0f range
    vTemp = _mm_mul_ps(vTemp,DecN4Mul);
    // Clamp result (for case of -512/-1)
    return _mm_max_ps( vTemp, g_XMNegativeOne );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Loads an XMDEC4 (signed 10:10:10:2, unnormalized integer values).
_Use_decl_annotations_
inline XMVECTOR PackedVector::XMLoadDec4
(
    const XMDEC4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    // Lookup tables to sign-extend the 10-bit and 2-bit fields
    static const uint32_t SignExtend[] = {0x00000000, 0xFFFFFC00};
    static const uint32_t SignExtendW[] = {0x00000000, 0xFFFFFFFC};

    uint32_t ElementX = pSource->v & 0x3FF;
    uint32_t ElementY = (pSource->v >> 10) & 0x3FF;
    uint32_t ElementZ = (pSource->v >> 20) & 0x3FF;
    uint32_t ElementW = pSource->v >> 30;

    XMVECTORF32 vResult = {
        (float)(int16_t)(ElementX | SignExtend[ElementX >> 9]),
        (float)(int16_t)(ElementY | SignExtend[ElementY >> 9]),
        (float)(int16_t)(ElementZ | SignExtend[ElementZ >> 9]),
        (float)(int16_t)(ElementW | SignExtendW[ElementW >> 1])
    };
    return vResult.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat the 32-bit value in all four entries
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask the 10:10:10:2 fields in place
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
    // Flip the sign bit of each field so the values order as signed
    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // Undo the sign-bit flip in float space
    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
    // Undo the field-position scaling (y/2^10, z/2^20, w/2^30)
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Loads an XMUBYTEN4 (normalized unsigned 8:8:8:8) — each byte maps to [0,1].
_Use_decl_annotations_
inline XMVECTOR PackedVector::XMLoadUByteN4
(
    const XMUBYTEN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTORF32 vResult = {
        (float)pSource->x / 255.0f,
        (float)pSource->y / 255.0f,
        (float)pSource->z / 255.0f,
        (float)pSource->w / 255.0f
    };
    return vResult.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // Per-component 1/255 scale; y/z/w also undo their byte-position shifts.
    static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)};
    // Splat the 32-bit value in all four entries
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // w occupies the sign bits; flip so the signed int conversion is correct
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // w + 0x80000000.f to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Normalize, and fix y, z and w because they are scaled by their bit position
    vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Loads an XMUBYTE4 (unsigned 8:8:8:8, unnormalized integer values).
_Use_decl_annotations_
inline XMVECTOR PackedVector::XMLoadUByte4
(
    const XMUBYTE4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTORF32 vResult = {
        (float)pSource->x,
        (float)pSource->y,
        (float)pSource->z,
        (float)pSource->w
    };
    return vResult.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // Undo the byte-position scaling (y/2^8, z/2^16, w/2^24)
    static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
    // Splat the 32-bit value in all four entries
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // w occupies the sign bits; flip so the signed int conversion is correct
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // w + 0x80000000.f to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Fix y, z and w because they are scaled by their bit position
    vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Loads an XMBYTEN4 (normalized signed 8:8:8:8); -128 clamps to -1.
_Use_decl_annotations_
inline XMVECTOR PackedVector::XMLoadByteN4
(
    const XMBYTEN4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTORF32 vResult = {
        (pSource->x == -128) ? -1.f : ((float)pSource->x / 127.0f),
        (pSource->y == -128) ? -1.f : ((float)pSource->y / 127.0f),
        (pSource->z == -128) ? -1.f : ((float)pSource->z / 127.0f),
        (pSource->w == -128) ? -1.f : ((float)pSource->w / 127.0f)
    };
    return vResult.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // Per-component 1/127 scale; y/z/w also undo their byte-position shifts.
    static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)};
    // Splat the 32-bit value in all four entries
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // Flip the sign bit of x, y and z so the values order as signed
    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // x, y and z - 0x80 to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
    // Normalize, and fix y, z and w because they are scaled by their bit position
    vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
    // Clamp result (for case of -128)
    return _mm_max_ps( vTemp, g_XMNegativeOne );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Loads an XMBYTE4 (signed 8:8:8:8, unnormalized integer values).
_Use_decl_annotations_
inline XMVECTOR PackedVector::XMLoadByte4
(
    const XMBYTE4* pSource
)
{
    assert(pSource);
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTORF32 vResult = {
        (float)pSource->x,
        (float)pSource->y,
        (float)pSource->z,
        (float)pSource->w
    };
    return vResult.v;
#elif defined(_XM_SSE_INTRINSICS_)
    // Undo the byte-position scaling (y/2^8, z/2^16, w/2^24)
    static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
    // Splat the 32-bit value in all four entries
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
    // Flip the sign bit of x, y and z so the values order as signed
    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(_mm_castps_si128(vTemp));
    // x, y and z - 0x80 to complete the conversion
    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
    // Fix y, z and w because they are scaled by their bit position
    vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Loads an XMUNIBBLE4 (unsigned 4:4:4:4, unnormalized integer values 0..15).
_Use_decl_annotations_
inline XMVECTOR PackedVector::XMLoadUNibble4
(
    const XMUNIBBLE4* pSource
)
{
    assert(pSource);
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
    // Undo the nibble-position scaling (y/2^4, z/2^8, w/2^12)
    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
    // Get the 16 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off the four nibbles in place
    vResult = _mm_and_ps(vResult,UNibble4And);
    // Convert to float
    vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
    // Fix y, z and w because they are scaled by their bit position
    vResult = _mm_mul_ps(vResult,UNibble4Mul);
    return vResult;
#else
    XMVECTORF32 vResult = {
        float(pSource->v & 0xF),
        float((pSource->v >> 4) & 0xF),
        float((pSource->v >> 8) & 0xF),
        float((pSource->v >> 12) & 0xF)
    };
    return vResult.v;
#endif // !_XM_SSE_INTRINSICS_
}

//------------------------------------------------------------------------------
// Loads an XMU555 (unsigned 5:5:5:1, unnormalized integer values).
_Use_decl_annotations_
inline XMVECTOR PackedVector::XMLoadU555
(
    const XMU555* pSource
)
{
    assert(pSource);
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
    // Undo the field-position scaling (y/2^5, z/2^10, w/2^15)
    static const XMVECTORF32 U555Mul =
{1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f}; 1313 // Get the 32 bit value and splat it 1314 XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v)); 1315 // Mask off x, y and z 1316 vResult = _mm_and_ps(vResult,U555And); 1317 // Convert to float 1318 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult)); 1319 // Normalize x, y, and z 1320 vResult = _mm_mul_ps(vResult,U555Mul); 1321 return vResult; 1322#else 1323 XMVECTORF32 vResult = { 1324 float(pSource->v & 0x1F), 1325 float((pSource->v >> 5) & 0x1F), 1326 float((pSource->v >> 10) & 0x1F), 1327 float((pSource->v >> 15) & 0x1) 1328 }; 1329 return vResult.v; 1330#endif // !_XM_SSE_INTRISICS_ 1331} 1332 1333 1334/**************************************************************************** 1335 * 1336 * Vector and matrix store operations 1337 * 1338 ****************************************************************************/ 1339_Use_decl_annotations_ 1340inline void PackedVector::XMStoreColor 1341( 1342 XMCOLOR* pDestination, 1343 FXMVECTOR V 1344) 1345{ 1346 assert(pDestination); 1347#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 1348 1349 static const XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f}; 1350 1351 XMVECTOR N = XMVectorSaturate(V); 1352 N = XMVectorMultiply(N, Scale.v); 1353 N = XMVectorRound(N); 1354 1355 XMFLOAT4A tmp; 1356 XMStoreFloat4A( &tmp, N ); 1357 1358 pDestination->c = ((uint32_t)tmp.w << 24) | 1359 ((uint32_t)tmp.x << 16) | 1360 ((uint32_t)tmp.y << 8) | 1361 ((uint32_t)tmp.z); 1362 1363#elif defined(_XM_SSE_INTRINSICS_) 1364 static const XMVECTORF32 Scale = {255.0f,255.0f,255.0f,255.0f}; 1365 // Set <0 to 0 1366 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 1367 // Set>1 to 1 1368 vResult = _mm_min_ps(vResult,g_XMOne); 1369 // Convert to 0-255 1370 vResult = _mm_mul_ps(vResult,Scale); 1371 // Shuffle RGBA to ARGB 1372 vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2)); 1373 // Convert to int 1374 __m128i vInt = _mm_cvtps_epi32(vResult); 1375 // 
Mash to shorts 1376 vInt = _mm_packs_epi32(vInt,vInt); 1377 // Mash to bytes 1378 vInt = _mm_packus_epi16(vInt,vInt); 1379 // Store the color 1380 _mm_store_ss(reinterpret_cast<float *>(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]); 1381#else // _XM_VMX128_INTRINSICS_ 1382#endif // _XM_VMX128_INTRINSICS_ 1383} 1384 1385//------------------------------------------------------------------------------ 1386_Use_decl_annotations_ 1387inline void PackedVector::XMStoreHalf2 1388( 1389 XMHALF2* pDestination, 1390 FXMVECTOR V 1391) 1392{ 1393 assert(pDestination); 1394#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 1395 1396 pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V)); 1397 pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V)); 1398 1399#else // _XM_VMX128_INTRINSICS_ 1400#endif // _XM_VMX128_INTRINSICS_ 1401} 1402 1403//------------------------------------------------------------------------------ 1404_Use_decl_annotations_ 1405inline void PackedVector::XMStoreShortN2 1406( 1407 XMSHORTN2* pDestination, 1408 FXMVECTOR V 1409) 1410{ 1411 assert(pDestination); 1412#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 1413 1414 static const XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; 1415 1416 XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); 1417 N = XMVectorMultiply(N, Scale.v); 1418 N = XMVectorRound(N); 1419 1420 XMFLOAT4A tmp; 1421 XMStoreFloat4A( &tmp, N ); 1422 1423 pDestination->x = (int16_t)tmp.x; 1424 pDestination->y = (int16_t)tmp.y; 1425 1426#elif defined(_XM_SSE_INTRINSICS_) 1427 static const XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; 1428 1429 XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); 1430 vResult = _mm_min_ps(vResult,g_XMOne); 1431 vResult = _mm_mul_ps(vResult,Scale); 1432 __m128i vResulti = _mm_cvtps_epi32(vResult); 1433 vResulti = _mm_packs_epi32(vResulti,vResulti); 1434 _mm_store_ss(reinterpret_cast<float 
*>(&pDestination->x),_mm_castsi128_ps(vResulti)); 1435#else // _XM_VMX128_INTRINSICS_ 1436#endif // _XM_VMX128_INTRINSICS_ 1437} 1438 1439//------------------------------------------------------------------------------ 1440_Use_decl_annotations_ 1441inline void PackedVector::XMStoreShort2 1442( 1443 XMSHORT2* pDestination, 1444 FXMVECTOR V 1445) 1446{ 1447 assert(pDestination); 1448#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 1449 1450 static const XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f}; 1451 static const XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; 1452 1453 XMVECTOR N = XMVectorClamp(V, Min, Max); 1454 N = XMVectorRound(N); 1455 1456 XMFLOAT4A tmp; 1457 XMStoreFloat4A( &tmp, N ); 1458 1459 pDestination->x = (int16_t)tmp.x; 1460 pDestination->y = (int16_t)tmp.y; 1461 1462#elif defined(_XM_SSE_INTRINSICS_) 1463 static const XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f}; 1464 static const XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; 1465 // Bounds check 1466 XMVECTOR vResult = _mm_max_ps(V,Min); 1467 vResult = _mm_min_ps(vResult,Max); 1468 // Convert to int with rounding 1469 __m128i vInt = _mm_cvtps_epi32(vResult); 1470 // Pack the ints into shorts 1471 vInt = _mm_packs_epi32(vInt,vInt); 1472 _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),_mm_castsi128_ps(vInt)); 1473#else // _XM_VMX128_INTRINSICS_ 1474#endif // _XM_VMX128_INTRINSICS_ 1475} 1476 1477//------------------------------------------------------------------------------ 1478_Use_decl_annotations_ 1479inline void PackedVector::XMStoreUShortN2 1480( 1481 XMUSHORTN2* pDestination, 1482 FXMVECTOR V 1483) 1484{ 1485 assert(pDestination); 1486#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 1487 1488 static const XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; 1489 1490 XMVECTOR N = XMVectorSaturate(V); 1491 N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v); 1492 N = 
XMVectorTruncate(N); 1493 1494 XMFLOAT4A tmp; 1495 XMStoreFloat4A( &tmp, N ); 1496 1497 pDestination->x = (int16_t)tmp.x; 1498 pDestination->y = (int16_t)tmp.y; 1499 1500#elif defined(_XM_SSE_INTRINSICS_) 1501 static const XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; 1502 // Bounds check 1503 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 1504 vResult = _mm_min_ps(vResult,g_XMOne); 1505 vResult = _mm_mul_ps(vResult,Scale); 1506 // Convert to int with rounding 1507 __m128i vInt = _mm_cvtps_epi32(vResult); 1508 // Since the SSE pack instruction clamps using signed rules, 1509 // manually extract the values to store them to memory 1510 pDestination->x = static_cast<int16_t>(_mm_extract_epi16(vInt,0)); 1511 pDestination->y = static_cast<int16_t>(_mm_extract_epi16(vInt,2)); 1512#else // _XM_VMX128_INTRINSICS_ 1513#endif // _XM_VMX128_INTRINSICS_ 1514} 1515 1516//------------------------------------------------------------------------------ 1517_Use_decl_annotations_ 1518inline void PackedVector::XMStoreUShort2 1519( 1520 XMUSHORT2* pDestination, 1521 FXMVECTOR V 1522) 1523{ 1524 assert(pDestination); 1525#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 1526 1527 static const XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; 1528 1529 XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max); 1530 N = XMVectorRound(N); 1531 1532 XMFLOAT4A tmp; 1533 XMStoreFloat4A( &tmp, N ); 1534 1535 pDestination->x = (int16_t)tmp.x; 1536 pDestination->y = (int16_t)tmp.y; 1537 1538#elif defined(_XM_SSE_INTRINSICS_) 1539 static const XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f}; 1540 // Bounds check 1541 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 1542 vResult = _mm_min_ps(vResult,Max); 1543 // Convert to int with rounding 1544 __m128i vInt = _mm_cvtps_epi32(vResult); 1545 // Since the SSE pack instruction clamps using signed rules, 1546 // manually extract the values to store them to memory 1547 pDestination->x = 
static_cast<int16_t>(_mm_extract_epi16(vInt,0)); 1548 pDestination->y = static_cast<int16_t>(_mm_extract_epi16(vInt,2)); 1549#else // _XM_VMX128_INTRINSICS_ 1550#endif // _XM_VMX128_INTRINSICS_ 1551} 1552 1553//------------------------------------------------------------------------------ 1554_Use_decl_annotations_ 1555inline void PackedVector::XMStoreByteN2 1556( 1557 XMBYTEN2* pDestination, 1558 FXMVECTOR V 1559) 1560{ 1561 assert(pDestination); 1562 1563 static const XMVECTORF32 Scale = {127.0f, 127.0f, 127.0f, 127.0f}; 1564 1565 XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); 1566 N = XMVectorMultiply(N, Scale.v); 1567 N = XMVectorRound(N); 1568 1569 XMFLOAT4A tmp; 1570 XMStoreFloat4A( &tmp, N ); 1571 1572 pDestination->x = (int8_t)tmp.x; 1573 pDestination->y = (int8_t)tmp.y; 1574} 1575 1576//------------------------------------------------------------------------------ 1577_Use_decl_annotations_ 1578inline void PackedVector::XMStoreByte2 1579( 1580 XMBYTE2* pDestination, 1581 FXMVECTOR V 1582) 1583{ 1584 assert(pDestination); 1585 1586 static const XMVECTORF32 Min = {-127.0f, -127.0f, -127.0f, -127.0f}; 1587 static const XMVECTORF32 Max = {127.0f, 127.0f, 127.0f, 127.0f}; 1588 1589 XMVECTOR N = XMVectorClamp(V, Min, Max); 1590 N = XMVectorRound(N); 1591 1592 XMFLOAT4A tmp; 1593 XMStoreFloat4A( &tmp, N ); 1594 1595 pDestination->x = (int8_t)tmp.x; 1596 pDestination->y = (int8_t)tmp.y; 1597} 1598 1599//------------------------------------------------------------------------------ 1600_Use_decl_annotations_ 1601inline void PackedVector::XMStoreUByteN2 1602( 1603 XMUBYTEN2* pDestination, 1604 FXMVECTOR V 1605) 1606{ 1607 assert(pDestination); 1608 1609 static const XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f}; 1610 1611 XMVECTOR N = XMVectorSaturate(V); 1612 N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v); 1613 N = XMVectorTruncate(N); 1614 1615 XMFLOAT4A tmp; 1616 XMStoreFloat4A( &tmp, N ); 1617 1618 pDestination->x = (uint8_t)tmp.x; 1619 
pDestination->y = (uint8_t)tmp.y; 1620} 1621 1622//------------------------------------------------------------------------------ 1623_Use_decl_annotations_ 1624inline void PackedVector::XMStoreUByte2 1625( 1626 XMUBYTE2* pDestination, 1627 FXMVECTOR V 1628) 1629{ 1630 assert(pDestination); 1631 1632 static const XMVECTORF32 Max = {255.0f, 255.0f, 255.0f, 255.0f}; 1633 1634 XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max); 1635 N = XMVectorRound(N); 1636 1637 XMFLOAT4A tmp; 1638 XMStoreFloat4A( &tmp, N ); 1639 1640 pDestination->x = (uint8_t)tmp.x; 1641 pDestination->y = (uint8_t)tmp.y; 1642} 1643 1644//------------------------------------------------------------------------------ 1645_Use_decl_annotations_ 1646inline void PackedVector::XMStoreU565 1647( 1648 XMU565* pDestination, 1649 FXMVECTOR V 1650) 1651{ 1652 assert(pDestination); 1653#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 1654 static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; 1655 // Bounds check 1656 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 1657 vResult = _mm_min_ps(vResult,Max); 1658 // Convert to int with rounding 1659 __m128i vInt = _mm_cvtps_epi32(vResult); 1660 // No SSE operations will write to 16-bit values, so we have to extract them manually 1661 uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0)); 1662 uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2)); 1663 uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4)); 1664 pDestination->v = ((z & 0x1F) << 11) | 1665 ((y & 0x3F) << 5) | 1666 ((x & 0x1F)); 1667#else 1668 static const XMVECTORF32 Max = {31.0f, 63.0f, 31.0f, 0.0f}; 1669 1670 XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); 1671 N = XMVectorRound(N); 1672 1673 XMFLOAT4A tmp; 1674 XMStoreFloat4A( &tmp, N ); 1675 1676 pDestination->v = (((uint16_t)tmp.z & 0x1F) << 11) | 1677 (((uint16_t)tmp.y & 0x3F) << 5) | 1678 (((uint16_t)tmp.x & 0x1F)); 1679#endif !_XM_SSE_INTRINSICS_ 1680} 1681 
1682//------------------------------------------------------------------------------ 1683_Use_decl_annotations_ 1684inline void PackedVector::XMStoreFloat3PK 1685( 1686 XMFLOAT3PK* pDestination, 1687 FXMVECTOR V 1688) 1689{ 1690 assert(pDestination); 1691 1692 __declspec(align(16)) uint32_t IValue[4]; 1693 XMStoreFloat3A( reinterpret_cast<XMFLOAT3A*>(&IValue), V ); 1694 1695 uint32_t Result[3]; 1696 1697 // X & Y Channels (5-bit exponent, 6-bit mantissa) 1698 for(uint32_t j=0; j < 2; ++j) 1699 { 1700 uint32_t Sign = IValue[j] & 0x80000000; 1701 uint32_t I = IValue[j] & 0x7FFFFFFF; 1702 1703 if ((I & 0x7F800000) == 0x7F800000) 1704 { 1705 // INF or NAN 1706 Result[j] = 0x7c0; 1707 if (( I & 0x7FFFFF ) != 0) 1708 { 1709 Result[j] = 0x7c0 | (((I>>17)|(I>11)|(I>>6)|(I))&0x3f); 1710 } 1711 else if ( Sign ) 1712 { 1713 // -INF is clamped to 0 since 3PK is positive only 1714 Result[j] = 0; 1715 } 1716 } 1717 else if ( Sign ) 1718 { 1719 // 3PK is positive only, so clamp to zero 1720 Result[j] = 0; 1721 } 1722 else if (I > 0x477E0000U) 1723 { 1724 // The number is too large to be represented as a float11, set to max 1725 Result[j] = 0x7BF; 1726 } 1727 else 1728 { 1729 if (I < 0x38800000U) 1730 { 1731 // The number is too small to be represented as a normalized float11 1732 // Convert it to a denormalized value. 
1733 uint32_t Shift = 113U - (I >> 23U); 1734 I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; 1735 } 1736 else 1737 { 1738 // Rebias the exponent to represent the value as a normalized float11 1739 I += 0xC8000000U; 1740 } 1741 1742 Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU; 1743 } 1744 } 1745 1746 // Z Channel (5-bit exponent, 5-bit mantissa) 1747 uint32_t Sign = IValue[2] & 0x80000000; 1748 uint32_t I = IValue[2] & 0x7FFFFFFF; 1749 1750 if ((I & 0x7F800000) == 0x7F800000) 1751 { 1752 // INF or NAN 1753 Result[2] = 0x3e0; 1754 if ( I & 0x7FFFFF ) 1755 { 1756 Result[2] = 0x3e0 | (((I>>18)|(I>13)|(I>>3)|(I))&0x1f); 1757 } 1758 else if ( Sign ) 1759 { 1760 // -INF is clamped to 0 since 3PK is positive only 1761 Result[2] = 0; 1762 } 1763 } 1764 else if ( Sign ) 1765 { 1766 // 3PK is positive only, so clamp to zero 1767 Result[2] = 0; 1768 } 1769 else if (I > 0x477C0000U) 1770 { 1771 // The number is too large to be represented as a float10, set to max 1772 Result[2] = 0x3df; 1773 } 1774 else 1775 { 1776 if (I < 0x38800000U) 1777 { 1778 // The number is too small to be represented as a normalized float10 1779 // Convert it to a denormalized value. 
1780 uint32_t Shift = 113U - (I >> 23U); 1781 I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; 1782 } 1783 else 1784 { 1785 // Rebias the exponent to represent the value as a normalized float10 1786 I += 0xC8000000U; 1787 } 1788 1789 Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU; 1790 } 1791 1792 // Pack Result into memory 1793 pDestination->v = (Result[0] & 0x7ff) 1794 | ( (Result[1] & 0x7ff) << 11 ) 1795 | ( (Result[2] & 0x3ff) << 22 ); 1796} 1797 1798//------------------------------------------------------------------------------ 1799_Use_decl_annotations_ 1800inline void PackedVector::XMStoreFloat3SE 1801( 1802 XMFLOAT3SE* pDestination, 1803 FXMVECTOR V 1804) 1805{ 1806 assert(pDestination); 1807 1808 __declspec(align(16)) uint32_t IValue[4]; 1809 XMStoreFloat3A( reinterpret_cast<XMFLOAT3A*>(&IValue), V ); 1810 1811 uint32_t Exp[3]; 1812 uint32_t Frac[3]; 1813 1814 // X, Y, Z Channels (5-bit exponent, 9-bit mantissa) 1815 for(uint32_t j=0; j < 3; ++j) 1816 { 1817 uint32_t Sign = IValue[j] & 0x80000000; 1818 uint32_t I = IValue[j] & 0x7FFFFFFF; 1819 1820 if ((I & 0x7F800000) == 0x7F800000) 1821 { 1822 // INF or NAN 1823 Exp[j] = 0x1f; 1824 if (( I & 0x7FFFFF ) != 0) 1825 { 1826 Frac[j] = ((I>>14)|(I>5)|(I))&0x1ff; 1827 } 1828 else if ( Sign ) 1829 { 1830 // -INF is clamped to 0 since 3SE is positive only 1831 Exp[j] = Frac[j] = 0; 1832 } 1833 } 1834 else if ( Sign ) 1835 { 1836 // 3SE is positive only, so clamp to zero 1837 Exp[j] = Frac[j] = 0; 1838 } 1839 else if (I > 0x477FC000U) 1840 { 1841 // The number is too large, set to max 1842 Exp[j] = 0x1e; 1843 Frac[j] = 0x1ff; 1844 } 1845 else 1846 { 1847 if (I < 0x38800000U) 1848 { 1849 // The number is too small to be represented as a normalized float11 1850 // Convert it to a denormalized value. 
1851 uint32_t Shift = 113U - (I >> 23U); 1852 I = (0x800000U | (I & 0x7FFFFFU)) >> Shift; 1853 } 1854 else 1855 { 1856 // Rebias the exponent to represent the value as a normalized float11 1857 I += 0xC8000000U; 1858 } 1859 1860 uint32_t T = ((I + 0x1FFFU + ((I >> 14U) & 1U)) >> 14U)&0x3fffU; 1861 1862 Exp[j] = (T & 0x3E00) >> 9; 1863 Frac[j] = T & 0x1ff; 1864 } 1865 } 1866 1867 // Adjust to a shared exponent 1868 uint32_t T = XMMax( Exp[0], XMMax( Exp[1], Exp[2] ) ); 1869 1870 Frac[0] = Frac[0] >> (T - Exp[0]); 1871 Frac[1] = Frac[1] >> (T - Exp[1]); 1872 Frac[2] = Frac[2] >> (T - Exp[2]); 1873 1874 // Store packed into memory 1875 pDestination->xm = Frac[0]; 1876 pDestination->ym = Frac[1]; 1877 pDestination->zm = Frac[2]; 1878 pDestination->e = T; 1879} 1880 1881//------------------------------------------------------------------------------ 1882_Use_decl_annotations_ 1883inline void PackedVector::XMStoreHalf4 1884( 1885 XMHALF4* pDestination, 1886 FXMVECTOR V 1887) 1888{ 1889 assert(pDestination); 1890#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 1891 1892 XMFLOAT4A t; 1893 XMStoreFloat4A(&t, V ); 1894 1895 pDestination->x = XMConvertFloatToHalf(t.x); 1896 pDestination->y = XMConvertFloatToHalf(t.y); 1897 pDestination->z = XMConvertFloatToHalf(t.z); 1898 pDestination->w = XMConvertFloatToHalf(t.w); 1899 1900#else // _XM_VMX128_INTRINSICS_ 1901#endif // _XM_VMX128_INTRINSICS_ 1902} 1903 1904//------------------------------------------------------------------------------ 1905_Use_decl_annotations_ 1906inline void PackedVector::XMStoreShortN4 1907( 1908 XMSHORTN4* pDestination, 1909 FXMVECTOR V 1910) 1911{ 1912 assert(pDestination); 1913#if defined(_XM_NO_INTRINSICS_) 1914 1915 static const XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f}; 1916 1917 XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); 1918 N = XMVectorMultiply(N, Scale.v); 1919 N = XMVectorRound(N); 1920 1921 XMFLOAT4A tmp; 
    XMStoreFloat4A(&tmp, N );

    pDestination->x = (int16_t)tmp.x;
    pDestination->y = (int16_t)tmp.y;
    pDestination->z = (int16_t)tmp.z;
    pDestination->w = (int16_t)tmp.w;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vmaxq_f32( V, g_XMNegativeOne );
    vResult = vminq_f32( vResult, g_XMOne );
    const __n128 Scale = vdupq_n_f32( 32767.0f );
    vResult = vmulq_f32( vResult, Scale );
    vResult = vcvtq_s32_f32( vResult );
    __n64 vInt = vmovn_s32( vResult );
    vst1_s16( (int16_t*)pDestination, vInt );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};

    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
    vResult = _mm_min_ps(vResult,g_XMOne);
    vResult = _mm_mul_ps(vResult,Scale);
    __m128i vResulti = _mm_cvtps_epi32(vResult);
    // Pack the four ints into the low 64 bits as shorts, then store them
    vResulti = _mm_packs_epi32(vResulti,vResulti);
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),_mm_castsi128_pd(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Stores all four components of V as signed 16-bit integers (clamped to
// +/-32767).
_Use_decl_annotations_
inline void PackedVector::XMStoreShort4
(
    XMSHORT4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)

    static const XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
    static const XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};

    XMVECTOR N = XMVectorClamp(V, Min, Max);
    N = XMVectorRound(N);

    XMFLOAT4A tmp;
    XMStoreFloat4A(&tmp, N );

    pDestination->x = (int16_t)tmp.x;
    pDestination->y = (int16_t)tmp.y;
    pDestination->z = (int16_t)tmp.z;
    pDestination->w = (int16_t)tmp.w;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
    static const XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};

    __n128 vResult = vmaxq_f32( V, Min );
    vResult = vminq_f32( vResult, Max );
    vResult = vcvtq_s32_f32( vResult );
    __n64 vInt = vmovn_s32( vResult );
    vst1_s16( (int16_t*)pDestination, vInt );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
    static const XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
    // Bounds check
    XMVECTOR vResult = _mm_max_ps(V,Min);
    vResult = _mm_min_ps(vResult,Max);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Pack the ints into shorts
    vInt = _mm_packs_epi32(vInt,vInt);
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),_mm_castsi128_pd(vInt));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Stores all four components of V as normalized unsigned 16-bit values
// (saturated to [0,1], scaled by 65535).
_Use_decl_annotations_
inline void PackedVector::XMStoreUShortN4
(
    XMUSHORTN4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)

    static const XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};

    XMVECTOR N = XMVectorSaturate(V);
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
    N = XMVectorTruncate(N);

    XMFLOAT4A tmp;
    XMStoreFloat4A(&tmp, N );

    // NOTE(review): casts via int16_t into what appears to be an unsigned
    // destination — matches the SSE path's bit pattern; confirm vs XMUSHORTN4.
    pDestination->x = (int16_t)tmp.x;
    pDestination->y = (int16_t)tmp.y;
    pDestination->z = (int16_t)tmp.z;
    pDestination->w = (int16_t)tmp.w;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vmaxq_f32( V, g_XMZero );
    vResult = vminq_f32( vResult, g_XMOne );
    const __n128 Scale = vdupq_n_f32( 65535.0f );
    vResult = vmulq_f32( vResult, Scale );
    vResult = vcvtq_u32_f32( vResult );
    __n64 vInt = vmovn_u32( vResult );
    vst1_u16( (uint16_t*)pDestination, vInt );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
    // Bounds check
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,g_XMOne);
    vResult = _mm_mul_ps(vResult,Scale);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Since the SSE pack instruction clamps using signed rules,
    // manually extract the values to store them to memory
    pDestination->x = static_cast<int16_t>(_mm_extract_epi16(vInt,0));
    pDestination->y = static_cast<int16_t>(_mm_extract_epi16(vInt,2));
    pDestination->z = static_cast<int16_t>(_mm_extract_epi16(vInt,4));
    pDestination->w = static_cast<int16_t>(_mm_extract_epi16(vInt,6));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Stores all four components of V as unsigned 16-bit integers (clamped to
// 0..65535).
_Use_decl_annotations_
inline void PackedVector::XMStoreUShort4
(
    XMUSHORT4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)

    static const XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};

    XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max);
    N = XMVectorRound(N);

    XMFLOAT4A tmp;
    XMStoreFloat4A(&tmp, N );

    pDestination->x = (int16_t)tmp.x;
    pDestination->y = (int16_t)tmp.y;
    pDestination->z = (int16_t)tmp.z;
    pDestination->w = (int16_t)tmp.w;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};

    __n128 vResult = vmaxq_f32( V, g_XMZero );
    vResult = vminq_f32( vResult, Max );
    vResult = vcvtq_u32_f32( vResult );
    __n64 vInt = vmovn_u32( vResult );
    vst1_u16( (uint16_t*)pDestination, vInt );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
    // Bounds check
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    vResult = _mm_min_ps(vResult,Max);
    // Convert to int with rounding
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Since the SSE pack instruction clamps using signed rules,
    // manually extract the values to store them to memory
    pDestination->x = static_cast<int16_t>(_mm_extract_epi16(vInt,0));
    pDestination->y = static_cast<int16_t>(_mm_extract_epi16(vInt,2));
    pDestination->z = static_cast<int16_t>(_mm_extract_epi16(vInt,4));
    pDestination->w = static_cast<int16_t>(_mm_extract_epi16(vInt,6));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------
// Stores V as a signed-xyz/unsigned-w 10:10:10:2 value (x/y/z clamped to
// [-1,1] and scaled by 511; w clamped to [0,1] and scaled by 3).
_Use_decl_annotations_
inline void PackedVector::XMStoreXDecN4
(
    XMXDECN4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
    static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 3.0f};

    XMVECTOR N = XMVectorClamp(V, Min.v, g_XMOne.v);
    N = XMVectorMultiply(N, Scale.v);
    N = XMVectorRound(N);

    XMFLOAT4A tmp;
    XMStoreFloat4A(&tmp, N );

    pDestination->v = ((uint32_t)tmp.w << 30) |
                      (((int32_t)tmp.z & 0x3FF) << 20) |
                      (((int32_t)tmp.y & 0x3FF) << 10) |
                      (((int32_t)tmp.x & 0x3FF));

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
    // Scales fold in each field's bit-position shift (2^10, 2^20, 2^29)
    static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f};
    static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29};
    XMVECTOR vResult = _mm_max_ps(V,Min);
    vResult = _mm_min_ps(vResult,g_XMOne);
    // Scale by multiplication
    vResult = _mm_mul_ps(vResult,Scale);
    // Convert to int (W is unsigned)
    __m128i
vResulti = _mm_cvtps_epi32(vResult); 2135 // Mask off any fraction 2136 vResulti = _mm_and_si128(vResulti,ScaleMask); 2137 // To fix W, add itself to shift it up to <<30 instead of <<29 2138 __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW); 2139 vResulti = _mm_add_epi32(vResulti,vResultw); 2140 // Do a horizontal or of all 4 entries 2141 vResult = XM_PERMUTE_PS(_mm_castsi128_ps(vResulti),_MM_SHUFFLE(0,3,2,1)); 2142 vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); 2143 vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); 2144 vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); 2145 vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,3,2,1)); 2146 vResulti = _mm_or_si128(vResulti,_mm_castps_si128(vResult)); 2147 _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); 2148#else // _XM_VMX128_INTRINSICS_ 2149#endif // _XM_VMX128_INTRINSICS_ 2150} 2151 2152//------------------------------------------------------------------------------ 2153_Use_decl_annotations_ 2154inline void PackedVector::XMStoreXDec4 2155( 2156 XMXDEC4* pDestination, 2157 FXMVECTOR V 2158) 2159{ 2160 assert(pDestination); 2161#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 2162 2163 static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, 0.0f}; 2164 static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 3.0f}; 2165 2166 XMVECTOR N = XMVectorClamp(V, Min, Max); 2167 2168 XMFLOAT4A tmp; 2169 XMStoreFloat4A(&tmp, N ); 2170 2171 pDestination->v = ((uint32_t)tmp.w << 30) | 2172 (((int32_t)tmp.z & 0x3FF) << 20) | 2173 (((int32_t)tmp.y & 0x3FF) << 10) | 2174 (((int32_t)tmp.x & 0x3FF)); 2175 2176#elif defined(_XM_SSE_INTRINSICS_) 2177 static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f}; 2178 static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f}; 2179 static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; 2180 static const XMVECTORI32 MaskXDec4= 
{0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; 2181 // Clamp to bounds 2182 XMVECTOR vResult = _mm_max_ps(V,MinXDec4); 2183 vResult = _mm_min_ps(vResult,MaxXDec4); 2184 // Scale by multiplication 2185 vResult = _mm_mul_ps(vResult,ScaleXDec4); 2186 // Convert to int 2187 __m128i vResulti = _mm_cvttps_epi32(vResult); 2188 // Mask off any fraction 2189 vResulti = _mm_and_si128(vResulti,MaskXDec4); 2190 // Do a horizontal or of 4 entries 2191 __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); 2192 // x = x|z, y = y|w 2193 vResulti = _mm_or_si128(vResulti,vResulti2); 2194 // Move Z to the x position 2195 vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); 2196 // Perform a single bit left shift on y|w 2197 vResulti2 = _mm_add_epi32(vResulti2,vResulti2); 2198 // i = x|y|z|w 2199 vResulti = _mm_or_si128(vResulti,vResulti2); 2200 _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); 2201#else // _XM_VMX128_INTRINSICS_ 2202#endif // _XM_VMX128_INTRINSICS_ 2203} 2204 2205//------------------------------------------------------------------------------ 2206_Use_decl_annotations_ 2207inline void PackedVector::XMStoreUDecN4 2208( 2209 XMUDECN4* pDestination, 2210 FXMVECTOR V 2211) 2212{ 2213 assert(pDestination); 2214#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 2215 2216 static const XMVECTORF32 Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f}; 2217 2218 XMVECTOR N = XMVectorSaturate(V); 2219 N = XMVectorMultiply(N, Scale.v); 2220 2221 XMFLOAT4A tmp; 2222 XMStoreFloat4A(&tmp, N ); 2223 2224 pDestination->v = ((uint32_t)tmp.w << 30) | 2225 (((uint32_t)tmp.z & 0x3FF) << 20) | 2226 (((uint32_t)tmp.y & 0x3FF) << 10) | 2227 (((uint32_t)tmp.x & 0x3FF)); 2228 2229#elif defined(_XM_SSE_INTRINSICS_) 2230 static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f}; 2231 static const XMVECTORI32 MaskUDecN4= 
{0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; 2232 // Clamp to bounds 2233 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 2234 vResult = _mm_min_ps(vResult,g_XMOne); 2235 // Scale by multiplication 2236 vResult = _mm_mul_ps(vResult,ScaleUDecN4); 2237 // Convert to int 2238 __m128i vResulti = _mm_cvttps_epi32(vResult); 2239 // Mask off any fraction 2240 vResulti = _mm_and_si128(vResulti,MaskUDecN4); 2241 // Do a horizontal or of 4 entries 2242 __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); 2243 // x = x|z, y = y|w 2244 vResulti = _mm_or_si128(vResulti,vResulti2); 2245 // Move Z to the x position 2246 vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); 2247 // Perform a left shift by one bit on y|w 2248 vResulti2 = _mm_add_epi32(vResulti2,vResulti2); 2249 // i = x|y|z|w 2250 vResulti = _mm_or_si128(vResulti,vResulti2); 2251 _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); 2252#else // _XM_VMX128_INTRINSICS_ 2253#endif // _XM_VMX128_INTRINSICS_ 2254} 2255 2256//------------------------------------------------------------------------------ 2257_Use_decl_annotations_ 2258inline void PackedVector::XMStoreUDec4 2259( 2260 XMUDEC4* pDestination, 2261 FXMVECTOR V 2262) 2263{ 2264 assert(pDestination); 2265#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 2266 2267 static const XMVECTORF32 Max = {1023.0f, 1023.0f, 1023.0f, 3.0f}; 2268 2269 XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max); 2270 2271 XMFLOAT4A tmp; 2272 XMStoreFloat4A(&tmp, N ); 2273 2274 pDestination->v = ((uint32_t)tmp.w << 30) | 2275 (((uint32_t)tmp.z & 0x3FF) << 20) | 2276 (((uint32_t)tmp.y & 0x3FF) << 10) | 2277 (((uint32_t)tmp.x & 0x3FF)); 2278 2279#elif defined(_XM_SSE_INTRINSICS_) 2280 static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f}; 2281 static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f}; 2282 static const XMVECTORI32 MaskUDec4= 
{0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)}; 2283 // Clamp to bounds 2284 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 2285 vResult = _mm_min_ps(vResult,MaxUDec4); 2286 // Scale by multiplication 2287 vResult = _mm_mul_ps(vResult,ScaleUDec4); 2288 // Convert to int 2289 __m128i vResulti = _mm_cvttps_epi32(vResult); 2290 // Mask off any fraction 2291 vResulti = _mm_and_si128(vResulti,MaskUDec4); 2292 // Do a horizontal or of 4 entries 2293 __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); 2294 // x = x|z, y = y|w 2295 vResulti = _mm_or_si128(vResulti,vResulti2); 2296 // Move Z to the x position 2297 vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); 2298 // Perform a left shift by one bit on y|w 2299 vResulti2 = _mm_add_epi32(vResulti2,vResulti2); 2300 // i = x|y|z|w 2301 vResulti = _mm_or_si128(vResulti,vResulti2); 2302 _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); 2303#else // _XM_VMX128_INTRINSICS_ 2304#endif // _XM_VMX128_INTRINSICS_ 2305} 2306 2307//------------------------------------------------------------------------------ 2308_Use_decl_annotations_ 2309inline void PackedVector::XMStoreDecN4 2310( 2311 XMDECN4* pDestination, 2312 FXMVECTOR V 2313) 2314{ 2315 assert(pDestination); 2316#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 2317 2318 static const XMVECTORF32 Scale = {511.0f, 511.0f, 511.0f, 1.0f}; 2319 2320 XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); 2321 N = XMVectorMultiply(N, Scale.v); 2322 2323 XMFLOAT4A tmp; 2324 XMStoreFloat4A(&tmp, N ); 2325 2326 pDestination->v = ((int32_t)tmp.w << 30) | 2327 (((int32_t)tmp.z & 0x3FF) << 20) | 2328 (((int32_t)tmp.y & 0x3FF) << 10) | 2329 (((int32_t)tmp.x & 0x3FF)); 2330 2331#elif defined(_XM_SSE_INTRINSICS_) 2332 static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f}; 2333 static const XMVECTORI32 MaskDecN4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30}; 
2334 // Clamp to bounds 2335 XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); 2336 vResult = _mm_min_ps(vResult,g_XMOne); 2337 // Scale by multiplication 2338 vResult = _mm_mul_ps(vResult,ScaleDecN4); 2339 // Convert to int 2340 __m128i vResulti = _mm_cvttps_epi32(vResult); 2341 // Mask off any fraction 2342 vResulti = _mm_and_si128(vResulti,MaskDecN4); 2343 // Do a horizontal or of 4 entries 2344 __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); 2345 // x = x|z, y = y|w 2346 vResulti = _mm_or_si128(vResulti,vResulti2); 2347 // Move Z to the x position 2348 vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); 2349 // i = x|y|z|w 2350 vResulti = _mm_or_si128(vResulti,vResulti2); 2351 _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); 2352#else // _XM_VMX128_INTRINSICS_ 2353#endif // _XM_VMX128_INTRINSICS_ 2354} 2355 2356//------------------------------------------------------------------------------ 2357_Use_decl_annotations_ 2358inline void PackedVector::XMStoreDec4 2359( 2360 XMDEC4* pDestination, 2361 FXMVECTOR V 2362) 2363{ 2364 assert(pDestination); 2365#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 2366 2367 static const XMVECTORF32 Min = {-511.0f, -511.0f, -511.0f, -1.0f}; 2368 static const XMVECTORF32 Max = {511.0f, 511.0f, 511.0f, 1.0f}; 2369 2370 XMVECTOR N = XMVectorClamp(V, Min, Max); 2371 2372 XMFLOAT4A tmp; 2373 XMStoreFloat4A(&tmp, N ); 2374 2375 pDestination->v = ((int32_t)tmp.w << 30) | 2376 (((int32_t)tmp.z & 0x3FF) << 20) | 2377 (((int32_t)tmp.y & 0x3FF) << 10) | 2378 (((int32_t)tmp.x & 0x3FF)); 2379 2380#elif defined(_XM_SSE_INTRINSICS_) 2381 static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f}; 2382 static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f}; 2383 static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f}; 2384 static const XMVECTORI32 MaskDec4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30}; 2385 // 
Clamp to bounds 2386 XMVECTOR vResult = _mm_max_ps(V,MinDec4); 2387 vResult = _mm_min_ps(vResult,MaxDec4); 2388 // Scale by multiplication 2389 vResult = _mm_mul_ps(vResult,ScaleDec4); 2390 // Convert to int 2391 __m128i vResulti = _mm_cvttps_epi32(vResult); 2392 // Mask off any fraction 2393 vResulti = _mm_and_si128(vResulti,MaskDec4); 2394 // Do a horizontal or of 4 entries 2395 __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); 2396 // x = x|z, y = y|w 2397 vResulti = _mm_or_si128(vResulti,vResulti2); 2398 // Move Z to the x position 2399 vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); 2400 // i = x|y|z|w 2401 vResulti = _mm_or_si128(vResulti,vResulti2); 2402 _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); 2403#else // _XM_VMX128_INTRINSICS_ 2404#endif // _XM_VMX128_INTRINSICS_ 2405} 2406 2407//------------------------------------------------------------------------------ 2408_Use_decl_annotations_ 2409inline void PackedVector::XMStoreUByteN4 2410( 2411 XMUBYTEN4* pDestination, 2412 FXMVECTOR V 2413) 2414{ 2415 assert(pDestination); 2416#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 2417 2418 static const XMVECTORF32 Scale = {255.0f, 255.0f, 255.0f, 255.0f}; 2419 2420 XMVECTOR N = XMVectorSaturate(V); 2421 N = XMVectorMultiply(N, Scale.v); 2422 N = XMVectorRound(N); 2423 2424 XMFLOAT4A tmp; 2425 XMStoreFloat4A(&tmp, N ); 2426 2427 pDestination->x = (uint8_t)tmp.x; 2428 pDestination->y = (uint8_t)tmp.y; 2429 pDestination->z = (uint8_t)tmp.z; 2430 pDestination->w = (uint8_t)tmp.w; 2431 2432#elif defined(_XM_SSE_INTRINSICS_) 2433 static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f}; 2434 static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)}; 2435 // Clamp to bounds 2436 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 2437 vResult = _mm_min_ps(vResult,g_XMOne); 2438 // Scale by 
multiplication 2439 vResult = _mm_mul_ps(vResult,ScaleUByteN4); 2440 // Convert to int 2441 __m128i vResulti = _mm_cvttps_epi32(vResult); 2442 // Mask off any fraction 2443 vResulti = _mm_and_si128(vResulti,MaskUByteN4); 2444 // Do a horizontal or of 4 entries 2445 __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); 2446 // x = x|z, y = y|w 2447 vResulti = _mm_or_si128(vResulti,vResulti2); 2448 // Move Z to the x position 2449 vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); 2450 // Perform a single bit left shift to fix y|w 2451 vResulti2 = _mm_add_epi32(vResulti2,vResulti2); 2452 // i = x|y|z|w 2453 vResulti = _mm_or_si128(vResulti,vResulti2); 2454 _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); 2455#else // _XM_VMX128_INTRINSICS_ 2456#endif // _XM_VMX128_INTRINSICS_ 2457} 2458 2459//------------------------------------------------------------------------------ 2460_Use_decl_annotations_ 2461inline void PackedVector::XMStoreUByte4 2462( 2463 XMUBYTE4* pDestination, 2464 FXMVECTOR V 2465) 2466{ 2467 assert(pDestination); 2468#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 2469 2470 static const XMVECTORF32 Max = {255.0f, 255.0f, 255.0f, 255.0f}; 2471 2472 XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max); 2473 N = XMVectorRound(N); 2474 2475 XMFLOAT4A tmp; 2476 XMStoreFloat4A(&tmp, N ); 2477 2478 pDestination->x = (uint8_t)tmp.x; 2479 pDestination->y = (uint8_t)tmp.y; 2480 pDestination->z = (uint8_t)tmp.z; 2481 pDestination->w = (uint8_t)tmp.w; 2482 2483#elif defined(_XM_SSE_INTRINSICS_) 2484 static const XMVECTORF32 MaxUByte4 = { 255.0f, 255.0f, 255.0f, 255.0f}; 2485 static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f}; 2486 static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)}; 2487 // Clamp to bounds 2488 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 2489 vResult = _mm_min_ps(vResult,MaxUByte4); 2490 // 
Scale by multiplication 2491 vResult = _mm_mul_ps(vResult,ScaleUByte4); 2492 // Convert to int 2493 __m128i vResulti = _mm_cvttps_epi32(vResult); 2494 // Mask off any fraction 2495 vResulti = _mm_and_si128(vResulti,MaskUByte4); 2496 // Do a horizontal or of 4 entries 2497 __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); 2498 // x = x|z, y = y|w 2499 vResulti = _mm_or_si128(vResulti,vResulti2); 2500 // Move Z to the x position 2501 vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); 2502 // Perform a single bit left shift to fix y|w 2503 vResulti2 = _mm_add_epi32(vResulti2,vResulti2); 2504 // i = x|y|z|w 2505 vResulti = _mm_or_si128(vResulti,vResulti2); 2506 _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); 2507#else // _XM_VMX128_INTRINSICS_ 2508#endif // _XM_VMX128_INTRINSICS_ 2509} 2510 2511//------------------------------------------------------------------------------ 2512_Use_decl_annotations_ 2513inline void PackedVector::XMStoreByteN4 2514( 2515 XMBYTEN4* pDestination, 2516 FXMVECTOR V 2517) 2518{ 2519 assert(pDestination); 2520#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 2521 2522 static const XMVECTORF32 Scale = {127.0f, 127.0f, 127.0f, 127.0f}; 2523 2524 XMVECTOR N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v); 2525 N = XMVectorMultiply(V, Scale.v); 2526 N = XMVectorRound(N); 2527 2528 XMFLOAT4A tmp; 2529 XMStoreFloat4A(&tmp, N ); 2530 2531 pDestination->x = (int8_t)tmp.x; 2532 pDestination->y = (int8_t)tmp.y; 2533 pDestination->z = (int8_t)tmp.z; 2534 pDestination->w = (int8_t)tmp.w; 2535 2536#elif defined(_XM_SSE_INTRINSICS_) 2537 static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f}; 2538 static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24}; 2539 // Clamp to bounds 2540 XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne); 2541 vResult = _mm_min_ps(vResult,g_XMOne); 2542 // Scale by 
multiplication 2543 vResult = _mm_mul_ps(vResult,ScaleByteN4); 2544 // Convert to int 2545 __m128i vResulti = _mm_cvttps_epi32(vResult); 2546 // Mask off any fraction 2547 vResulti = _mm_and_si128(vResulti,MaskByteN4); 2548 // Do a horizontal or of 4 entries 2549 __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); 2550 // x = x|z, y = y|w 2551 vResulti = _mm_or_si128(vResulti,vResulti2); 2552 // Move Z to the x position 2553 vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); 2554 // i = x|y|z|w 2555 vResulti = _mm_or_si128(vResulti,vResulti2); 2556 _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); 2557#else // _XM_VMX128_INTRINSICS_ 2558#endif // _XM_VMX128_INTRINSICS_ 2559} 2560 2561//------------------------------------------------------------------------------ 2562_Use_decl_annotations_ 2563inline void PackedVector::XMStoreByte4 2564( 2565 XMBYTE4* pDestination, 2566 FXMVECTOR V 2567) 2568{ 2569 assert(pDestination); 2570#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) 2571 2572 static const XMVECTORF32 Min = {-127.0f, -127.0f, -127.0f, -127.0f}; 2573 static const XMVECTORF32 Max = {127.0f, 127.0f, 127.0f, 127.0f}; 2574 2575 XMVECTOR N = XMVectorClamp(V, Min, Max); 2576 N = XMVectorRound(N); 2577 2578 XMFLOAT4A tmp; 2579 XMStoreFloat4A(&tmp, N ); 2580 2581 pDestination->x = (int8_t)tmp.x; 2582 pDestination->y = (int8_t)tmp.y; 2583 pDestination->z = (int8_t)tmp.z; 2584 pDestination->w = (int8_t)tmp.w; 2585 2586#elif defined(_XM_SSE_INTRINSICS_) 2587 static const XMVECTORF32 MinByte4 = {-127.0f,-127.0f,-127.0f,-127.0f}; 2588 static const XMVECTORF32 MaxByte4 = { 127.0f, 127.0f, 127.0f, 127.0f}; 2589 static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f}; 2590 static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24}; 2591 // Clamp to bounds 2592 XMVECTOR vResult = _mm_max_ps(V,MinByte4); 2593 vResult = _mm_min_ps(vResult,MaxByte4); 2594 
// Scale by multiplication 2595 vResult = _mm_mul_ps(vResult,ScaleByte4); 2596 // Convert to int 2597 __m128i vResulti = _mm_cvttps_epi32(vResult); 2598 // Mask off any fraction 2599 vResulti = _mm_and_si128(vResulti,MaskByte4); 2600 // Do a horizontal or of 4 entries 2601 __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2)); 2602 // x = x|z, y = y|w 2603 vResulti = _mm_or_si128(vResulti,vResulti2); 2604 // Move Z to the x position 2605 vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1)); 2606 // i = x|y|z|w 2607 vResulti = _mm_or_si128(vResulti,vResulti2); 2608 _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),_mm_castsi128_ps(vResulti)); 2609#else // _XM_VMX128_INTRINSICS_ 2610#endif // _XM_VMX128_INTRINSICS_ 2611} 2612 2613//------------------------------------------------------------------------------ 2614_Use_decl_annotations_ 2615inline void PackedVector::XMStoreUNibble4 2616( 2617 XMUNIBBLE4* pDestination, 2618 FXMVECTOR V 2619) 2620{ 2621 assert(pDestination); 2622#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 2623 static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; 2624 // Bounds check 2625 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 2626 vResult = _mm_min_ps(vResult,Max); 2627 // Convert to int with rounding 2628 __m128i vInt = _mm_cvtps_epi32(vResult); 2629 // No SSE operations will write to 16-bit values, so we have to extract them manually 2630 uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0)); 2631 uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2)); 2632 uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4)); 2633 uint16_t w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6)); 2634 pDestination->v = ((w & 0xF) << 12) | 2635 ((z & 0xF) << 8) | 2636 ((y & 0xF) << 4) | 2637 ((x & 0xF)); 2638#else 2639 static const XMVECTORF32 Max = {15.0f,15.0f,15.0f,15.0f}; 2640 2641 XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); 2642 N = XMVectorRound(N); 2643 2644 XMFLOAT4A tmp; 
2645 XMStoreFloat4A(&tmp, N ); 2646 2647 pDestination->v = (((uint16_t)tmp.w & 0xF) << 12) | 2648 (((uint16_t)tmp.z & 0xF) << 8) | 2649 (((uint16_t)tmp.y & 0xF) << 4) | 2650 (((uint16_t)tmp.x & 0xF)); 2651#endif !_XM_SSE_INTRINSICS_ 2652} 2653 2654//------------------------------------------------------------------------------ 2655_Use_decl_annotations_ 2656inline void PackedVector::XMStoreU555 2657( 2658 XMU555* pDestination, 2659 FXMVECTOR V 2660) 2661{ 2662 assert(pDestination); 2663#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_) 2664 static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; 2665 // Bounds check 2666 XMVECTOR vResult = _mm_max_ps(V,g_XMZero); 2667 vResult = _mm_min_ps(vResult,Max); 2668 // Convert to int with rounding 2669 __m128i vInt = _mm_cvtps_epi32(vResult); 2670 // No SSE operations will write to 16-bit values, so we have to extract them manually 2671 uint16_t x = static_cast<uint16_t>(_mm_extract_epi16(vInt,0)); 2672 uint16_t y = static_cast<uint16_t>(_mm_extract_epi16(vInt,2)); 2673 uint16_t z = static_cast<uint16_t>(_mm_extract_epi16(vInt,4)); 2674 uint16_t w = static_cast<uint16_t>(_mm_extract_epi16(vInt,6)); 2675 pDestination->v = ((w) ? 0x8000 : 0) | 2676 ((z & 0x1F) << 10) | 2677 ((y & 0x1F) << 5) | 2678 ((x & 0x1F)); 2679#else 2680 static const XMVECTORF32 Max = {31.0f, 31.0f, 31.0f, 1.0f}; 2681 2682 XMVECTOR N = XMVectorClamp(V, XMVectorZero(), Max.v); 2683 N = XMVectorRound(N); 2684 2685 XMFLOAT4A tmp; 2686 XMStoreFloat4A(&tmp, N ); 2687 2688 pDestination->v = ((tmp.w > 0.f) ? 
0x8000 : 0) | 2689 (((uint16_t)tmp.z & 0x1F) << 10) | 2690 (((uint16_t)tmp.y & 0x1F) << 5) | 2691 (((uint16_t)tmp.x & 0x1F)); 2692#endif !_XM_SSE_INTRINSICS_ 2693} 2694 2695 2696/**************************************************************************** 2697 * 2698 * XMCOLOR operators 2699 * 2700 ****************************************************************************/ 2701 2702//------------------------------------------------------------------------------ 2703 2704inline PackedVector::XMCOLOR::XMCOLOR 2705( 2706 float _r, 2707 float _g, 2708 float _b, 2709 float _a 2710) 2711{ 2712 XMStoreColor(this, XMVectorSet(_r, _g, _b, _a)); 2713} 2714 2715//------------------------------------------------------------------------------ 2716_Use_decl_annotations_ 2717inline PackedVector::XMCOLOR::XMCOLOR 2718( 2719 const float* pArray 2720) 2721{ 2722 XMStoreColor(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 2723} 2724 2725/**************************************************************************** 2726 * 2727 * XMHALF2 operators 2728 * 2729 ****************************************************************************/ 2730 2731//------------------------------------------------------------------------------ 2732 2733inline PackedVector::XMHALF2::XMHALF2 2734( 2735 float _x, 2736 float _y 2737) 2738{ 2739 x = XMConvertFloatToHalf(_x); 2740 y = XMConvertFloatToHalf(_y); 2741} 2742 2743//------------------------------------------------------------------------------ 2744_Use_decl_annotations_ 2745inline PackedVector::XMHALF2::XMHALF2 2746( 2747 const float* pArray 2748) 2749{ 2750 assert( pArray != nullptr ); 2751 x = XMConvertFloatToHalf(pArray[0]); 2752 y = XMConvertFloatToHalf(pArray[1]); 2753} 2754 2755/**************************************************************************** 2756 * 2757 * XMSHORTN2 operators 2758 * 2759 ****************************************************************************/ 2760 
2761//------------------------------------------------------------------------------ 2762 2763inline PackedVector::XMSHORTN2::XMSHORTN2 2764( 2765 float _x, 2766 float _y 2767) 2768{ 2769 XMStoreShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); 2770} 2771 2772//------------------------------------------------------------------------------ 2773_Use_decl_annotations_ 2774inline PackedVector::XMSHORTN2::XMSHORTN2 2775( 2776 const float* pArray 2777) 2778{ 2779 XMStoreShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); 2780} 2781 2782/**************************************************************************** 2783 * 2784 * XMSHORT2 operators 2785 * 2786 ****************************************************************************/ 2787 2788//------------------------------------------------------------------------------ 2789 2790inline PackedVector::XMSHORT2::XMSHORT2 2791( 2792 float _x, 2793 float _y 2794) 2795{ 2796 XMStoreShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); 2797} 2798 2799//------------------------------------------------------------------------------ 2800_Use_decl_annotations_ 2801inline PackedVector::XMSHORT2::XMSHORT2 2802( 2803 const float* pArray 2804) 2805{ 2806 XMStoreShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); 2807} 2808 2809/**************************************************************************** 2810 * 2811 * XMUSHORTN2 operators 2812 * 2813 ****************************************************************************/ 2814 2815//------------------------------------------------------------------------------ 2816 2817inline PackedVector::XMUSHORTN2::XMUSHORTN2 2818( 2819 float _x, 2820 float _y 2821) 2822{ 2823 XMStoreUShortN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); 2824} 2825 2826//------------------------------------------------------------------------------ 2827_Use_decl_annotations_ 2828inline PackedVector::XMUSHORTN2::XMUSHORTN2 2829( 2830 const float* pArray 2831) 2832{ 2833 
XMStoreUShortN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); 2834} 2835 2836/**************************************************************************** 2837 * 2838 * XMUSHORT2 operators 2839 * 2840 ****************************************************************************/ 2841 2842//------------------------------------------------------------------------------ 2843 2844inline PackedVector::XMUSHORT2::XMUSHORT2 2845( 2846 float _x, 2847 float _y 2848) 2849{ 2850 XMStoreUShort2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); 2851} 2852 2853//------------------------------------------------------------------------------ 2854_Use_decl_annotations_ 2855inline PackedVector::XMUSHORT2::XMUSHORT2 2856( 2857 const float* pArray 2858) 2859{ 2860 XMStoreUShort2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); 2861} 2862 2863/**************************************************************************** 2864 * 2865 * XMBYTEN2 operators 2866 * 2867 ****************************************************************************/ 2868 2869//------------------------------------------------------------------------------ 2870 2871inline PackedVector::XMBYTEN2::XMBYTEN2 2872( 2873 float _x, 2874 float _y 2875) 2876{ 2877 XMStoreByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); 2878} 2879 2880//------------------------------------------------------------------------------ 2881_Use_decl_annotations_ 2882inline PackedVector::XMBYTEN2::XMBYTEN2 2883( 2884 const float* pArray 2885) 2886{ 2887 XMStoreByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); 2888} 2889 2890/**************************************************************************** 2891 * 2892 * XMBYTE2 operators 2893 * 2894 ****************************************************************************/ 2895 2896//------------------------------------------------------------------------------ 2897 2898inline PackedVector::XMBYTE2::XMBYTE2 2899( 2900 float _x, 2901 float _y 2902) 2903{ 2904 
XMStoreByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); 2905} 2906 2907//------------------------------------------------------------------------------ 2908_Use_decl_annotations_ 2909inline PackedVector::XMBYTE2::XMBYTE2 2910( 2911 const float* pArray 2912) 2913{ 2914 XMStoreByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); 2915} 2916 2917/**************************************************************************** 2918 * 2919 * XMUBYTEN2 operators 2920 * 2921 ****************************************************************************/ 2922 2923//------------------------------------------------------------------------------ 2924 2925inline PackedVector::XMUBYTEN2::XMUBYTEN2 2926( 2927 float _x, 2928 float _y 2929) 2930{ 2931 XMStoreUByteN2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); 2932} 2933 2934//------------------------------------------------------------------------------ 2935_Use_decl_annotations_ 2936inline PackedVector::XMUBYTEN2::XMUBYTEN2 2937( 2938 const float* pArray 2939) 2940{ 2941 XMStoreUByteN2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); 2942} 2943 2944/**************************************************************************** 2945 * 2946 * XMUBYTE2 operators 2947 * 2948 ****************************************************************************/ 2949 2950//------------------------------------------------------------------------------ 2951 2952inline PackedVector::XMUBYTE2::XMUBYTE2 2953( 2954 float _x, 2955 float _y 2956) 2957{ 2958 XMStoreUByte2(this, XMVectorSet(_x, _y, 0.0f, 0.0f)); 2959} 2960 2961//------------------------------------------------------------------------------ 2962_Use_decl_annotations_ 2963inline PackedVector::XMUBYTE2::XMUBYTE2 2964( 2965 const float* pArray 2966) 2967{ 2968 XMStoreUByte2(this, XMLoadFloat2(reinterpret_cast<const XMFLOAT2*>(pArray))); 2969} 2970 2971/**************************************************************************** 2972 * 2973 * XMU565 operators 2974 * 2975 
****************************************************************************/ 2976 2977inline PackedVector::XMU565::XMU565 2978( 2979 float _x, 2980 float _y, 2981 float _z 2982) 2983{ 2984 XMStoreU565(this, XMVectorSet( _x, _y, _z, 0.0f )); 2985} 2986 2987_Use_decl_annotations_ 2988inline PackedVector::XMU565::XMU565 2989( 2990 const float *pArray 2991) 2992{ 2993 XMStoreU565(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray))); 2994} 2995 2996/**************************************************************************** 2997 * 2998 * XMFLOAT3PK operators 2999 * 3000 ****************************************************************************/ 3001 3002inline PackedVector::XMFLOAT3PK::XMFLOAT3PK 3003( 3004 float _x, 3005 float _y, 3006 float _z 3007) 3008{ 3009 XMStoreFloat3PK(this, XMVectorSet( _x, _y, _z, 0.0f )); 3010} 3011 3012_Use_decl_annotations_ 3013inline PackedVector::XMFLOAT3PK::XMFLOAT3PK 3014( 3015 const float *pArray 3016) 3017{ 3018 XMStoreFloat3PK(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray))); 3019} 3020 3021/**************************************************************************** 3022 * 3023 * XMFLOAT3SE operators 3024 * 3025 ****************************************************************************/ 3026 3027inline PackedVector::XMFLOAT3SE::XMFLOAT3SE 3028( 3029 float _x, 3030 float _y, 3031 float _z 3032) 3033{ 3034 XMStoreFloat3SE(this, XMVectorSet( _x, _y, _z, 0.0f )); 3035} 3036 3037_Use_decl_annotations_ 3038inline PackedVector::XMFLOAT3SE::XMFLOAT3SE 3039( 3040 const float *pArray 3041) 3042{ 3043 XMStoreFloat3SE(this, XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray))); 3044} 3045 3046/**************************************************************************** 3047 * 3048 * XMHALF4 operators 3049 * 3050 ****************************************************************************/ 3051 3052//------------------------------------------------------------------------------ 3053 3054inline 
PackedVector::XMHALF4::XMHALF4 3055( 3056 float _x, 3057 float _y, 3058 float _z, 3059 float _w 3060) 3061{ 3062 x = XMConvertFloatToHalf(_x); 3063 y = XMConvertFloatToHalf(_y); 3064 z = XMConvertFloatToHalf(_z); 3065 w = XMConvertFloatToHalf(_w); 3066} 3067 3068//------------------------------------------------------------------------------ 3069 3070_Use_decl_annotations_ 3071inline PackedVector::XMHALF4::XMHALF4 3072( 3073 const float* pArray 3074) 3075{ 3076 XMConvertFloatToHalfStream(&x, sizeof(HALF), pArray, sizeof(float), 4); 3077} 3078 3079/**************************************************************************** 3080 * 3081 * XMSHORTN4 operators 3082 * 3083 ****************************************************************************/ 3084 3085//------------------------------------------------------------------------------ 3086 3087inline PackedVector::XMSHORTN4::XMSHORTN4 3088( 3089 float _x, 3090 float _y, 3091 float _z, 3092 float _w 3093) 3094{ 3095 XMStoreShortN4(this, XMVectorSet(_x, _y, _z, _w)); 3096} 3097 3098//------------------------------------------------------------------------------ 3099_Use_decl_annotations_ 3100inline PackedVector::XMSHORTN4::XMSHORTN4 3101( 3102 const float* pArray 3103) 3104{ 3105 XMStoreShortN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3106} 3107 3108/**************************************************************************** 3109 * 3110 * XMSHORT4 operators 3111 * 3112 ****************************************************************************/ 3113 3114//------------------------------------------------------------------------------ 3115 3116inline PackedVector::XMSHORT4::XMSHORT4 3117( 3118 float _x, 3119 float _y, 3120 float _z, 3121 float _w 3122) 3123{ 3124 XMStoreShort4(this, XMVectorSet(_x, _y, _z, _w)); 3125} 3126 3127//------------------------------------------------------------------------------ 3128_Use_decl_annotations_ 3129inline PackedVector::XMSHORT4::XMSHORT4 3130( 3131 const 
float* pArray 3132) 3133{ 3134 XMStoreShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3135} 3136 3137/**************************************************************************** 3138 * 3139 * XMUSHORTN4 operators 3140 * 3141 ****************************************************************************/ 3142 3143//------------------------------------------------------------------------------ 3144 3145inline PackedVector::XMUSHORTN4::XMUSHORTN4 3146( 3147 float _x, 3148 float _y, 3149 float _z, 3150 float _w 3151) 3152{ 3153 XMStoreUShortN4(this, XMVectorSet(_x, _y, _z, _w)); 3154} 3155 3156//------------------------------------------------------------------------------ 3157_Use_decl_annotations_ 3158inline PackedVector::XMUSHORTN4::XMUSHORTN4 3159( 3160 const float* pArray 3161) 3162{ 3163 XMStoreUShortN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3164} 3165 3166/**************************************************************************** 3167 * 3168 * XMUSHORT4 operators 3169 * 3170 ****************************************************************************/ 3171 3172//------------------------------------------------------------------------------ 3173 3174inline PackedVector::XMUSHORT4::XMUSHORT4 3175( 3176 float _x, 3177 float _y, 3178 float _z, 3179 float _w 3180) 3181{ 3182 XMStoreUShort4(this, XMVectorSet(_x, _y, _z, _w)); 3183} 3184 3185//------------------------------------------------------------------------------ 3186_Use_decl_annotations_ 3187inline PackedVector::XMUSHORT4::XMUSHORT4 3188( 3189 const float* pArray 3190) 3191{ 3192 XMStoreUShort4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3193} 3194 3195/**************************************************************************** 3196 * 3197 * XMXDECN4 operators 3198 * 3199 ****************************************************************************/ 3200 3201//------------------------------------------------------------------------------ 
3202 3203inline PackedVector::XMXDECN4::XMXDECN4 3204( 3205 float _x, 3206 float _y, 3207 float _z, 3208 float _w 3209) 3210{ 3211 XMStoreXDecN4(this, XMVectorSet(_x, _y, _z, _w)); 3212} 3213 3214//------------------------------------------------------------------------------ 3215_Use_decl_annotations_ 3216inline PackedVector::XMXDECN4::XMXDECN4 3217( 3218 const float* pArray 3219) 3220{ 3221 XMStoreXDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3222} 3223 3224/**************************************************************************** 3225 * 3226 * XMXDEC4 operators 3227 * 3228 ****************************************************************************/ 3229 3230//------------------------------------------------------------------------------ 3231 3232inline PackedVector::XMXDEC4::XMXDEC4 3233( 3234 float _x, 3235 float _y, 3236 float _z, 3237 float _w 3238) 3239{ 3240 XMStoreXDec4(this, XMVectorSet(_x, _y, _z, _w)); 3241} 3242 3243//------------------------------------------------------------------------------ 3244_Use_decl_annotations_ 3245inline PackedVector::XMXDEC4::XMXDEC4 3246( 3247 const float* pArray 3248) 3249{ 3250 XMStoreXDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3251} 3252 3253/**************************************************************************** 3254 * 3255 * XMDECN4 operators 3256 * 3257 ****************************************************************************/ 3258 3259//------------------------------------------------------------------------------ 3260 3261inline PackedVector::XMDECN4::XMDECN4 3262( 3263 float _x, 3264 float _y, 3265 float _z, 3266 float _w 3267) 3268{ 3269 XMStoreDecN4(this, XMVectorSet(_x, _y, _z, _w)); 3270} 3271 3272//------------------------------------------------------------------------------ 3273_Use_decl_annotations_ 3274inline PackedVector::XMDECN4::XMDECN4 3275( 3276 const float* pArray 3277) 3278{ 3279 XMStoreDecN4(this, XMLoadFloat4(reinterpret_cast<const 
XMFLOAT4*>(pArray))); 3280} 3281 3282/**************************************************************************** 3283 * 3284 * XMDEC4 operators 3285 * 3286 ****************************************************************************/ 3287 3288//------------------------------------------------------------------------------ 3289 3290inline PackedVector::XMDEC4::XMDEC4 3291( 3292 float _x, 3293 float _y, 3294 float _z, 3295 float _w 3296) 3297{ 3298 XMStoreDec4(this, XMVectorSet(_x, _y, _z, _w)); 3299} 3300 3301//------------------------------------------------------------------------------ 3302_Use_decl_annotations_ 3303inline PackedVector::XMDEC4::XMDEC4 3304( 3305 const float* pArray 3306) 3307{ 3308 XMStoreDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3309} 3310 3311/**************************************************************************** 3312 * 3313 * XMUDECN4 operators 3314 * 3315 ****************************************************************************/ 3316 3317//------------------------------------------------------------------------------ 3318 3319inline PackedVector::XMUDECN4::XMUDECN4 3320( 3321 float _x, 3322 float _y, 3323 float _z, 3324 float _w 3325) 3326{ 3327 XMStoreUDecN4(this, XMVectorSet(_x, _y, _z, _w)); 3328} 3329 3330//------------------------------------------------------------------------------ 3331_Use_decl_annotations_ 3332inline PackedVector::XMUDECN4::XMUDECN4 3333( 3334 const float* pArray 3335) 3336{ 3337 XMStoreUDecN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3338} 3339 3340/**************************************************************************** 3341 * 3342 * XMUDEC4 operators 3343 * 3344 ****************************************************************************/ 3345 3346//------------------------------------------------------------------------------ 3347 3348inline PackedVector::XMUDEC4::XMUDEC4 3349( 3350 float _x, 3351 float _y, 3352 float _z, 3353 float _w 3354) 3355{ 
3356 XMStoreUDec4(this, XMVectorSet(_x, _y, _z, _w)); 3357} 3358 3359//------------------------------------------------------------------------------ 3360_Use_decl_annotations_ 3361inline PackedVector::XMUDEC4::XMUDEC4 3362( 3363 const float* pArray 3364) 3365{ 3366 XMStoreUDec4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3367} 3368 3369/**************************************************************************** 3370 * 3371 * XMBYTEN4 operators 3372 * 3373 ****************************************************************************/ 3374 3375//------------------------------------------------------------------------------ 3376 3377inline PackedVector::XMBYTEN4::XMBYTEN4 3378( 3379 float _x, 3380 float _y, 3381 float _z, 3382 float _w 3383) 3384{ 3385 XMStoreByteN4(this, XMVectorSet(_x, _y, _z, _w)); 3386} 3387 3388//------------------------------------------------------------------------------ 3389_Use_decl_annotations_ 3390inline PackedVector::XMBYTEN4::XMBYTEN4 3391( 3392 const float* pArray 3393) 3394{ 3395 XMStoreByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3396} 3397 3398/**************************************************************************** 3399 * 3400 * XMBYTE4 operators 3401 * 3402 ****************************************************************************/ 3403 3404//------------------------------------------------------------------------------ 3405 3406inline PackedVector::XMBYTE4::XMBYTE4 3407( 3408 float _x, 3409 float _y, 3410 float _z, 3411 float _w 3412) 3413{ 3414 XMStoreByte4(this, XMVectorSet(_x, _y, _z, _w)); 3415} 3416 3417//------------------------------------------------------------------------------ 3418_Use_decl_annotations_ 3419inline PackedVector::XMBYTE4::XMBYTE4 3420( 3421 const float* pArray 3422) 3423{ 3424 XMStoreByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3425} 3426 3427/**************************************************************************** 3428 * 
3429 * XMUBYTEN4 operators 3430 * 3431 ****************************************************************************/ 3432 3433//------------------------------------------------------------------------------ 3434 3435inline PackedVector::XMUBYTEN4::XMUBYTEN4 3436( 3437 float _x, 3438 float _y, 3439 float _z, 3440 float _w 3441) 3442{ 3443 XMStoreUByteN4(this, XMVectorSet(_x, _y, _z, _w)); 3444} 3445 3446//------------------------------------------------------------------------------ 3447_Use_decl_annotations_ 3448inline PackedVector::XMUBYTEN4::XMUBYTEN4 3449( 3450 const float* pArray 3451) 3452{ 3453 XMStoreUByteN4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3454} 3455 3456/**************************************************************************** 3457 * 3458 * XMUBYTE4 operators 3459 * 3460 ****************************************************************************/ 3461 3462//------------------------------------------------------------------------------ 3463 3464inline PackedVector::XMUBYTE4::XMUBYTE4 3465( 3466 float _x, 3467 float _y, 3468 float _z, 3469 float _w 3470) 3471{ 3472 XMStoreUByte4(this, XMVectorSet(_x, _y, _z, _w)); 3473} 3474 3475//------------------------------------------------------------------------------ 3476_Use_decl_annotations_ 3477inline PackedVector::XMUBYTE4::XMUBYTE4 3478( 3479 const float* pArray 3480) 3481{ 3482 XMStoreUByte4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3483} 3484 3485/**************************************************************************** 3486 * 3487 * XMUNIBBLE4 operators 3488 * 3489 ****************************************************************************/ 3490 3491//------------------------------------------------------------------------------ 3492 3493inline PackedVector::XMUNIBBLE4::XMUNIBBLE4 3494( 3495 float _x, 3496 float _y, 3497 float _z, 3498 float _w 3499) 3500{ 3501 XMStoreUNibble4(this, XMVectorSet( _x, _y, _z, _w )); 3502} 3503 
3504//------------------------------------------------------------------------------ 3505_Use_decl_annotations_ 3506inline PackedVector::XMUNIBBLE4::XMUNIBBLE4 3507( 3508 const float *pArray 3509) 3510{ 3511 XMStoreUNibble4(this, XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(pArray))); 3512} 3513 3514/**************************************************************************** 3515 * 3516 * XMU555 operators 3517 * 3518 ****************************************************************************/ 3519 3520//------------------------------------------------------------------------------ 3521 3522inline PackedVector::XMU555::XMU555 3523( 3524 float _x, 3525 float _y, 3526 float _z, 3527 bool _w 3528) 3529{ 3530 XMStoreU555(this, XMVectorSet(_x, _y, _z, ((_w) ? 1.0f : 0.0f) )); 3531} 3532 3533//------------------------------------------------------------------------------ 3534_Use_decl_annotations_ 3535inline PackedVector::XMU555::XMU555 3536( 3537 const float *pArray, 3538 bool _w 3539) 3540{ 3541 XMVECTOR V = XMLoadFloat3(reinterpret_cast<const XMFLOAT3*>(pArray)); 3542 XMStoreU555(this, XMVectorSetW(V, ((_w) ? 1.0f : 0.0f) )); 3543} 3544 3545