// The game where you go into mines and start crafting! — but for consoles
// (forked directly from smartcmd's GitHub).
//-------------------------------------------------------------------------------------
// DirectXMathConvert.inl -- SIMD C++ Math library
//
// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
// PARTICULAR PURPOSE.
//
// Copyright (c) Microsoft Corporation. All rights reserved.
//-------------------------------------------------------------------------------------
11
12#ifdef _MSC_VER
13#pragma once
14#endif
15
16/****************************************************************************
17 *
18 * Data conversion
19 *
20 ****************************************************************************/
21
22//------------------------------------------------------------------------------
23
24#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
25// For VMX128, these routines are all defines in the main header
26
27#pragma warning(push)
28#pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized
29
30inline XMVECTOR XMConvertVectorIntToFloat
31(
32 FXMVECTOR VInt,
33 uint32_t DivExponent
34)
35{
36 assert(DivExponent<32);
37#if defined(_XM_NO_INTRINSICS_)
38 float fScale = 1.0f / (float)(1U << DivExponent);
39 uint32_t ElementIndex = 0;
40 XMVECTOR Result;
41 do {
42 int32_t iTemp = (int32_t)VInt.vector4_u32[ElementIndex];
43 Result.vector4_f32[ElementIndex] = ((float)iTemp) * fScale;
44 } while (++ElementIndex<4);
45 return Result;
46#elif defined(_XM_ARM_NEON_INTRINSICS_)
47 __n128 vResult = vcvtq_f32_s32( VInt );
48 uint32_t uScale = 0x3F800000U - (DivExponent << 23);
49 __n128 vScale = vdupq_n_u32( uScale );
50 return vmulq_f32( vResult, vScale );
51#else // _XM_SSE_INTRINSICS_
52 // Convert to floats
53 XMVECTOR vResult = _mm_cvtepi32_ps(_mm_castps_si128(VInt));
54 // Convert DivExponent into 1.0f/(1<<DivExponent)
55 uint32_t uScale = 0x3F800000U - (DivExponent << 23);
56 // Splat the scalar value
57 __m128i vScale = _mm_set1_epi32(uScale);
58 vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(vScale));
59 return vResult;
60#endif
61}
62
63//------------------------------------------------------------------------------
64
65inline XMVECTOR XMConvertVectorFloatToInt
66(
67 FXMVECTOR VFloat,
68 uint32_t MulExponent
69)
70{
71 assert(MulExponent<32);
72#if defined(_XM_NO_INTRINSICS_)
73 // Get the scalar factor.
74 float fScale = (float)(1U << MulExponent);
75 uint32_t ElementIndex = 0;
76 XMVECTOR Result;
77 do {
78 int32_t iResult;
79 float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
80 if (fTemp <= -(65536.0f*32768.0f)) {
81 iResult = (-0x7FFFFFFF)-1;
82 } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
83 iResult = 0x7FFFFFFF;
84 } else {
85 iResult = (int32_t)fTemp;
86 }
87 Result.vector4_u32[ElementIndex] = (uint32_t)iResult;
88 } while (++ElementIndex<4);
89 return Result;
90#elif defined(_XM_ARM_NEON_INTRINSICS_)
91 __n128 vResult = vdupq_n_f32((float)(1U << MulExponent));
92 vResult = vmulq_f32(vResult,VFloat);
93 // In case of positive overflow, detect it
94 __n128 vOverflow = vcgtq_f32(vResult,g_XMMaxInt);
95 // Float to int conversion
96 __n128 vResulti = vcvtq_s32_f32(vResult);
97 // If there was positive overflow, set to 0x7FFFFFFF
98 vResult = vandq_u32(vOverflow,g_XMAbsMask);
99 vOverflow = vbicq_u32(vResulti,vOverflow);
100 vOverflow = vorrq_u32(vOverflow,vResult);
101 return vOverflow;
102#else // _XM_SSE_INTRINSICS_
103 XMVECTOR vResult = _mm_set_ps1((float)(1U << MulExponent));
104 vResult = _mm_mul_ps(vResult,VFloat);
105 // In case of positive overflow, detect it
106 XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt);
107 // Float to int conversion
108 __m128i vResulti = _mm_cvttps_epi32(vResult);
109 // If there was positive overflow, set to 0x7FFFFFFF
110 vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
111 vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
112 vOverflow = _mm_or_ps(vOverflow,vResult);
113 return vOverflow;
114#endif
115}
116
117//------------------------------------------------------------------------------
118
119inline XMVECTOR XMConvertVectorUIntToFloat
120(
121 FXMVECTOR VUInt,
122 uint32_t DivExponent
123)
124{
125 assert(DivExponent<32);
126#if defined(_XM_NO_INTRINSICS_)
127 float fScale = 1.0f / (float)(1U << DivExponent);
128 uint32_t ElementIndex = 0;
129 XMVECTOR Result;
130 do {
131 Result.vector4_f32[ElementIndex] = (float)VUInt.vector4_u32[ElementIndex] * fScale;
132 } while (++ElementIndex<4);
133 return Result;
134#elif defined(_XM_ARM_NEON_INTRINSICS_)
135 __n128 vResult = vcvtq_f32_u32( VUInt );
136 uint32_t uScale = 0x3F800000U - (DivExponent << 23);
137 __n128 vScale = vdupq_n_u32( uScale );
138 return vmulq_f32( vResult, vScale );
139#else // _XM_SSE_INTRINSICS_
140 // For the values that are higher than 0x7FFFFFFF, a fixup is needed
141 // Determine which ones need the fix.
142 XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
143 // Force all values positive
144 XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
145 // Convert to floats
146 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
147 // Convert 0x80000000 -> 0xFFFFFFFF
148 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
149 // For only the ones that are too big, add the fixup
150 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
151 vResult = _mm_add_ps(vResult,vMask);
152 // Convert DivExponent into 1.0f/(1<<DivExponent)
153 uint32_t uScale = 0x3F800000U - (DivExponent << 23);
154 // Splat
155 iMask = _mm_set1_epi32(uScale);
156 vResult = _mm_mul_ps(vResult,_mm_castsi128_ps(iMask));
157 return vResult;
158#endif
159}
160
161//------------------------------------------------------------------------------
162
163inline XMVECTOR XMConvertVectorFloatToUInt
164(
165 FXMVECTOR VFloat,
166 uint32_t MulExponent
167)
168{
169 assert(MulExponent<32);
170#if defined(_XM_NO_INTRINSICS_)
171 // Get the scalar factor.
172 float fScale = (float)(1U << MulExponent);
173 uint32_t ElementIndex = 0;
174 XMVECTOR Result;
175 do {
176 uint32_t uResult;
177 float fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
178 if (fTemp <= 0.0f) {
179 uResult = 0;
180 } else if (fTemp >= (65536.0f*65536.0f)) {
181 uResult = 0xFFFFFFFFU;
182 } else {
183 uResult = (uint32_t)fTemp;
184 }
185 Result.vector4_u32[ElementIndex] = uResult;
186 } while (++ElementIndex<4);
187 return Result;
188#elif defined(_XM_ARM_NEON_INTRINSICS_)
189 __n128 vResult = vdupq_n_f32((float)(1U << MulExponent));
190 vResult = vmulq_f32(vResult,VFloat);
191 // In case of overflow, detect it
192 __n128 vOverflow = vcgtq_f32(vResult,g_XMMaxUInt);
193 // Float to int conversion
194 __n128 vResulti = vcvtq_u32_f32(vResult);
195 // If there was overflow, set to 0xFFFFFFFFU
196 vResult = vbicq_u32(vResulti,vOverflow);
197 vOverflow = vorrq_u32(vOverflow,vResult);
198 return vOverflow;
199#else // _XM_SSE_INTRINSICS_
200 XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
201 vResult = _mm_mul_ps(vResult,VFloat);
202 // Clamp to >=0
203 vResult = _mm_max_ps(vResult,g_XMZero);
204 // Any numbers that are too big, set to 0xFFFFFFFFU
205 XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
206 XMVECTOR vValue = g_XMUnsignedFix;
207 // Too large for a signed integer?
208 XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
209 // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise
210 vValue = _mm_and_ps(vValue,vMask);
211 // Perform fixup only on numbers too large (Keeps low bit precision)
212 vResult = _mm_sub_ps(vResult,vValue);
213 __m128i vResulti = _mm_cvttps_epi32(vResult);
214 // Convert from signed to unsigned pnly if greater than 0x80000000
215 vMask = _mm_and_ps(vMask,g_XMNegativeZero);
216 vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
217 // On those that are too large, set to 0xFFFFFFFF
218 vResult = _mm_or_ps(vResult,vOverflow);
219 return vResult;
220#endif
221}
222
223#pragma warning(pop)
224
225#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_ || _XM_ARM_NEON_INTRINSICS_
226
227/****************************************************************************
228 *
229 * Vector and matrix load operations
230 *
231 ****************************************************************************/
232
233//------------------------------------------------------------------------------
234_Use_decl_annotations_
235inline XMVECTOR XMLoadInt(const uint32_t* pSource)
236{
237 assert(pSource);
238#if defined(_XM_NO_INTRINSICS_)
239 XMVECTOR V;
240 V.vector4_u32[0] = *pSource;
241 V.vector4_u32[1] = 0;
242 V.vector4_u32[2] = 0;
243 V.vector4_u32[3] = 0;
244 return V;
245#elif defined(_XM_ARM_NEON_INTRINSICS_)
246 __n128 zero = vdupq_n_u32(0);
247 return vld1q_lane_u32( pSource, zero, 0 );
248#elif defined(_XM_SSE_INTRINSICS_)
249 return _mm_load_ss( reinterpret_cast<const float*>(pSource) );
250#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
251#endif // _XM_VMX128_INTRINSICS_
252}
253
254//------------------------------------------------------------------------------
255_Use_decl_annotations_
256inline XMVECTOR XMLoadFloat(const float* pSource)
257{
258 assert(pSource);
259#if defined(_XM_NO_INTRINSICS_)
260 XMVECTOR V;
261 V.vector4_f32[0] = *pSource;
262 V.vector4_f32[1] = 0.f;
263 V.vector4_f32[2] = 0.f;
264 V.vector4_f32[3] = 0.f;
265 return V;
266#elif defined(_XM_ARM_NEON_INTRINSICS_)
267 __n128 zero = vdupq_n_u32(0);
268 return vld1q_lane_f32( pSource, zero, 0 );
269#elif defined(_XM_SSE_INTRINSICS_)
270 return _mm_load_ss( pSource );
271#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
272#endif // _XM_VMX128_INTRINSICS_
273}
274
275//------------------------------------------------------------------------------
276_Use_decl_annotations_
277inline XMVECTOR XMLoadInt2
278(
279 const uint32_t* pSource
280)
281{
282 assert(pSource);
283#if defined(_XM_NO_INTRINSICS_)
284 XMVECTOR V;
285 V.vector4_u32[0] = pSource[0];
286 V.vector4_u32[1] = pSource[1];
287 V.vector4_u32[2] = 0;
288 V.vector4_u32[3] = 0;
289 return V;
290#elif defined(_XM_ARM_NEON_INTRINSICS_)
291 __n64 x = vld1_u32( pSource );
292 __n64 zero = vdup_n_u32(0);
293 return vcombine_u32( x, zero );
294#elif defined(_XM_SSE_INTRINSICS_)
295 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
296 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
297 return _mm_unpacklo_ps( x, y );
298#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
299#endif // _XM_VMX128_INTRINSICS_
300}
301
302//------------------------------------------------------------------------------
303_Use_decl_annotations_
304inline XMVECTOR XMLoadInt2A
305(
306 const uint32_t* pSource
307)
308{
309 assert(pSource);
310 assert(((uintptr_t)pSource & 0xF) == 0);
311#if defined(_XM_NO_INTRINSICS_)
312 XMVECTOR V;
313 V.vector4_u32[0] = pSource[0];
314 V.vector4_u32[1] = pSource[1];
315 V.vector4_u32[2] = 0;
316 V.vector4_u32[3] = 0;
317 return V;
318#elif defined(_XM_ARM_NEON_INTRINSICS_)
319 __n64 x = vld1_u32_ex( pSource, 64 );
320 __n64 zero = vdup_n_u32(0);
321 return vcombine_u32( x, zero );
322#elif defined(_XM_SSE_INTRINSICS_)
323 __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
324 return reinterpret_cast<__m128 *>(&V)[0];
325#else // _XM_VMX128_INTRINSICS_
326#endif // _XM_VMX128_INTRINSICS_
327}
328
329//------------------------------------------------------------------------------
330_Use_decl_annotations_
331inline XMVECTOR XMLoadFloat2
332(
333 const XMFLOAT2* pSource
334)
335{
336 assert(pSource);
337#if defined(_XM_NO_INTRINSICS_)
338 XMVECTOR V;
339 V.vector4_f32[0] = pSource->x;
340 V.vector4_f32[1] = pSource->y;
341 V.vector4_f32[2] = 0.f;
342 V.vector4_f32[3] = 0.f;
343 return V;
344#elif defined(_XM_ARM_NEON_INTRINSICS_)
345 __n64 x = vld1_f32( reinterpret_cast<const float*>(pSource) );
346 __n64 zero = vdup_n_u32(0);
347 return vcombine_f32( x, zero );
348#elif defined(_XM_SSE_INTRINSICS_)
349 __m128 x = _mm_load_ss( &pSource->x );
350 __m128 y = _mm_load_ss( &pSource->y );
351 return _mm_unpacklo_ps( x, y );
352#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
353#endif // _XM_VMX128_INTRINSICS_
354}
355
356//------------------------------------------------------------------------------
357_Use_decl_annotations_
358inline XMVECTOR XMLoadFloat2A
359(
360 const XMFLOAT2A* pSource
361)
362{
363 assert(pSource);
364 assert(((uintptr_t)pSource & 0xF) == 0);
365#if defined(_XM_NO_INTRINSICS_)
366 XMVECTOR V;
367 V.vector4_f32[0] = pSource->x;
368 V.vector4_f32[1] = pSource->y;
369 V.vector4_f32[2] = 0.f;
370 V.vector4_f32[3] = 0.f;
371 return V;
372#elif defined(_XM_ARM_NEON_INTRINSICS_)
373 __n64 x = vld1_f32_ex( reinterpret_cast<const float*>(pSource), 64 );
374 __n64 zero = vdup_n_u32(0);
375 return vcombine_f32( x, zero );
376#elif defined(_XM_SSE_INTRINSICS_)
377 __m128i V = _mm_loadl_epi64( reinterpret_cast<const __m128i*>(pSource) );
378 return reinterpret_cast<__m128 *>(&V)[0];
379#else // _XM_VMX128_INTRINSICS_
380#endif // _XM_VMX128_INTRINSICS_
381}
382
383//------------------------------------------------------------------------------
384_Use_decl_annotations_
385inline XMVECTOR XMLoadSInt2
386(
387 const XMINT2* pSource
388)
389{
390 assert(pSource);
391#if defined(_XM_NO_INTRINSICS_)
392 XMVECTOR V;
393 V.vector4_f32[0] = (float)pSource->x;
394 V.vector4_f32[1] = (float)pSource->y;
395 V.vector4_f32[2] = 0.f;
396 V.vector4_f32[3] = 0.f;
397 return V;
398#elif defined(_XM_ARM_NEON_INTRINSICS_)
399 __n64 x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
400 __n64 v = vcvt_f32_s32( x );
401 __n64 zero = vdup_n_u32(0);
402 return vcombine_s32( v, zero );
403#elif defined(_XM_SSE_INTRINSICS_)
404 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
405 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
406 __m128 V = _mm_unpacklo_ps( x, y );
407 return _mm_cvtepi32_ps(_mm_castps_si128(V));
408#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
409#endif // _XM_VMX128_INTRINSICS_
410}
411
412//------------------------------------------------------------------------------
413_Use_decl_annotations_
414inline XMVECTOR XMLoadUInt2
415(
416 const XMUINT2* pSource
417)
418{
419 assert(pSource);
420#if defined(_XM_NO_INTRINSICS_)
421 XMVECTOR V;
422 V.vector4_f32[0] = (float)pSource->x;
423 V.vector4_f32[1] = (float)pSource->y;
424 V.vector4_f32[2] = 0.f;
425 V.vector4_f32[3] = 0.f;
426 return V;
427#elif defined(_XM_ARM_NEON_INTRINSICS_)
428 __n64 x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
429 __n64 v = vcvt_f32_u32( x );
430 __n64 zero = vdup_n_u32(0);
431 return vcombine_u32( v, zero );
432#elif defined(_XM_SSE_INTRINSICS_)
433 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
434 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
435 __m128 V = _mm_unpacklo_ps( x, y );
436 // For the values that are higher than 0x7FFFFFFF, a fixup is needed
437 // Determine which ones need the fix.
438 XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
439 // Force all values positive
440 XMVECTOR vResult = _mm_xor_ps(V,vMask);
441 // Convert to floats
442 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
443 // Convert 0x80000000 -> 0xFFFFFFFF
444 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
445 // For only the ones that are too big, add the fixup
446 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
447 vResult = _mm_add_ps(vResult,vMask);
448 return vResult;
449#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
450#endif // _XM_VMX128_INTRINSICS_
451}
452
453//------------------------------------------------------------------------------
454_Use_decl_annotations_
455inline XMVECTOR XMLoadInt3
456(
457 const uint32_t* pSource
458)
459{
460 assert(pSource);
461#if defined(_XM_NO_INTRINSICS_)
462 XMVECTOR V;
463 V.vector4_u32[0] = pSource[0];
464 V.vector4_u32[1] = pSource[1];
465 V.vector4_u32[2] = pSource[2];
466 V.vector4_u32[3] = 0;
467 return V;
468#elif defined(_XM_ARM_NEON_INTRINSICS_)
469 __n64 x = vld1_u32( pSource );
470 __n64 zero = vdup_n_u32(0);
471 __n64 y = vld1_lane_u32( pSource+2, zero, 0 );
472 return vcombine_u32( x, y );
473#elif defined(_XM_SSE_INTRINSICS_)
474 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(pSource) );
475 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(pSource+1) );
476 __m128 z = _mm_load_ss( reinterpret_cast<const float*>(pSource+2) );
477 __m128 xy = _mm_unpacklo_ps( x, y );
478 return _mm_movelh_ps( xy, z );
479#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
480#endif // _XM_VMX128_INTRINSICS_
481}
482
483//------------------------------------------------------------------------------
484_Use_decl_annotations_
485inline XMVECTOR XMLoadInt3A
486(
487 const uint32_t* pSource
488)
489{
490 assert(pSource);
491 assert(((uintptr_t)pSource & 0xF) == 0);
492#if defined(_XM_NO_INTRINSICS_)
493 XMVECTOR V;
494 V.vector4_u32[0] = pSource[0];
495 V.vector4_u32[1] = pSource[1];
496 V.vector4_u32[2] = pSource[2];
497 V.vector4_u32[3] = 0;
498 return V;
499#elif defined(_XM_ARM_NEON_INTRINSICS_)
500 // Reads an extra integer which is zero'd
501 __n128 V = vld1q_u32_ex( pSource, 128 );
502 return vsetq_lane_u32( 0, V, 3 );
503#elif defined(_XM_SSE_INTRINSICS_)
504 // Reads an extra integer which is zero'd
505 __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
506 V = _mm_and_si128( V, g_XMMask3 );
507 return reinterpret_cast<__m128 *>(&V)[0];
508#else // _XM_VMX128_INTRINSICS_
509#endif // _XM_VMX128_INTRINSICS_
510}
511
512//------------------------------------------------------------------------------
513_Use_decl_annotations_
514inline XMVECTOR XMLoadFloat3
515(
516 const XMFLOAT3* pSource
517)
518{
519 assert(pSource);
520#if defined(_XM_NO_INTRINSICS_)
521 XMVECTOR V;
522 V.vector4_f32[0] = pSource->x;
523 V.vector4_f32[1] = pSource->y;
524 V.vector4_f32[2] = pSource->z;
525 V.vector4_f32[3] = 0.f;
526 return V;
527#elif defined(_XM_ARM_NEON_INTRINSICS_)
528 __n64 x = vld1_f32( reinterpret_cast<const float*>(pSource) );
529 __n64 zero = vdup_n_u32(0);
530 __n64 y = vld1_lane_f32( reinterpret_cast<const float*>(pSource)+2, zero, 0 );
531 return vcombine_f32( x, y );
532#elif defined(_XM_SSE_INTRINSICS_)
533 __m128 x = _mm_load_ss( &pSource->x );
534 __m128 y = _mm_load_ss( &pSource->y );
535 __m128 z = _mm_load_ss( &pSource->z );
536 __m128 xy = _mm_unpacklo_ps( x, y );
537 return _mm_movelh_ps( xy, z );
538#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
539#endif // _XM_VMX128_INTRINSICS_
540}
541
542//------------------------------------------------------------------------------
543_Use_decl_annotations_
544inline XMVECTOR XMLoadFloat3A
545(
546 const XMFLOAT3A* pSource
547)
548{
549 assert(pSource);
550 assert(((uintptr_t)pSource & 0xF) == 0);
551#if defined(_XM_NO_INTRINSICS_)
552 XMVECTOR V;
553 V.vector4_f32[0] = pSource->x;
554 V.vector4_f32[1] = pSource->y;
555 V.vector4_f32[2] = pSource->z;
556 V.vector4_f32[3] = 0.f;
557 return V;
558#elif defined(_XM_ARM_NEON_INTRINSICS_)
559 // Reads an extra float which is zero'd
560 __n128 V = vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
561 return vsetq_lane_f32( 0, V, 3 );
562#elif defined(_XM_SSE_INTRINSICS_)
563 // Reads an extra float which is zero'd
564 __m128 V = _mm_load_ps( &pSource->x );
565 return _mm_and_ps( V, g_XMMask3 );
566#else // _XM_VMX128_INTRINSICS_
567#endif // _XM_VMX128_INTRINSICS_
568}
569
570//------------------------------------------------------------------------------
571_Use_decl_annotations_
572inline XMVECTOR XMLoadSInt3
573(
574 const XMINT3* pSource
575)
576{
577 assert(pSource);
578#if defined(_XM_NO_INTRINSICS_)
579
580 XMVECTOR V;
581 V.vector4_f32[0] = (float)pSource->x;
582 V.vector4_f32[1] = (float)pSource->y;
583 V.vector4_f32[2] = (float)pSource->z;
584 V.vector4_f32[3] = 0.f;
585 return V;
586
587#elif defined(_XM_ARM_NEON_INTRINSICS_)
588 __n64 x = vld1_s32( reinterpret_cast<const int32_t*>(pSource) );
589 __n64 zero = vdup_n_u32(0);
590 __n64 y = vld1_lane_s32( reinterpret_cast<const int32_t*>(pSource)+2, zero, 0 );
591 __n128 v = vcombine_s32( x, y );
592 return vcvtq_f32_s32( v );
593#elif defined(_XM_SSE_INTRINSICS_)
594 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
595 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
596 __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
597 __m128 xy = _mm_unpacklo_ps( x, y );
598 __m128 V = _mm_movelh_ps( xy, z );
599 return _mm_cvtepi32_ps(_mm_castps_si128(V));
600#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
601#endif // _XM_VMX128_INTRINSICS_
602}
603
604//------------------------------------------------------------------------------
605_Use_decl_annotations_
606inline XMVECTOR XMLoadUInt3
607(
608 const XMUINT3* pSource
609)
610{
611 assert(pSource);
612#if defined(_XM_NO_INTRINSICS_)
613 XMVECTOR V;
614 V.vector4_f32[0] = (float)pSource->x;
615 V.vector4_f32[1] = (float)pSource->y;
616 V.vector4_f32[2] = (float)pSource->z;
617 V.vector4_f32[3] = 0.f;
618 return V;
619#elif defined(_XM_ARM_NEON_INTRINSICS_)
620 __n64 x = vld1_u32( reinterpret_cast<const uint32_t*>(pSource) );
621 __n64 zero = vdup_n_u32(0);
622 __n64 y = vld1_lane_u32( reinterpret_cast<const uint32_t*>(pSource)+2, zero, 0 );
623 __n128 v = vcombine_u32( x, y );
624 return vcvtq_f32_u32( v );
625#elif defined(_XM_SSE_INTRINSICS_)
626 __m128 x = _mm_load_ss( reinterpret_cast<const float*>(&pSource->x) );
627 __m128 y = _mm_load_ss( reinterpret_cast<const float*>(&pSource->y) );
628 __m128 z = _mm_load_ss( reinterpret_cast<const float*>(&pSource->z) );
629 __m128 xy = _mm_unpacklo_ps( x, y );
630 __m128 V = _mm_movelh_ps( xy, z );
631 // For the values that are higher than 0x7FFFFFFF, a fixup is needed
632 // Determine which ones need the fix.
633 XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
634 // Force all values positive
635 XMVECTOR vResult = _mm_xor_ps(V,vMask);
636 // Convert to floats
637 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
638 // Convert 0x80000000 -> 0xFFFFFFFF
639 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
640 // For only the ones that are too big, add the fixup
641 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
642 vResult = _mm_add_ps(vResult,vMask);
643 return vResult;
644
645#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
646#endif // _XM_VMX128_INTRINSICS_
647}
648
649//------------------------------------------------------------------------------
650_Use_decl_annotations_
651inline XMVECTOR XMLoadInt4
652(
653 const uint32_t* pSource
654)
655{
656 assert(pSource);
657
658#if defined(_XM_NO_INTRINSICS_)
659 XMVECTOR V;
660 V.vector4_u32[0] = pSource[0];
661 V.vector4_u32[1] = pSource[1];
662 V.vector4_u32[2] = pSource[2];
663 V.vector4_u32[3] = pSource[3];
664 return V;
665#elif defined(_XM_ARM_NEON_INTRINSICS_)
666 return vld1q_u32( pSource );
667#elif defined(_XM_SSE_INTRINSICS_)
668 __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
669 return reinterpret_cast<__m128 *>(&V)[0];
670#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
671#endif // _XM_VMX128_INTRINSICS_
672}
673
674//------------------------------------------------------------------------------
675_Use_decl_annotations_
676inline XMVECTOR XMLoadInt4A
677(
678 const uint32_t* pSource
679)
680{
681 assert(pSource);
682 assert(((uintptr_t)pSource & 0xF) == 0);
683#if defined(_XM_NO_INTRINSICS_)
684 XMVECTOR V;
685 V.vector4_u32[0] = pSource[0];
686 V.vector4_u32[1] = pSource[1];
687 V.vector4_u32[2] = pSource[2];
688 V.vector4_u32[3] = pSource[3];
689 return V;
690#elif defined(_XM_ARM_NEON_INTRINSICS_)
691 return vld1q_u32_ex( pSource, 128 );
692#elif defined(_XM_SSE_INTRINSICS_)
693 __m128i V = _mm_load_si128( reinterpret_cast<const __m128i*>(pSource) );
694 return reinterpret_cast<__m128 *>(&V)[0];
695#else // _XM_VMX128_INTRINSICS_
696#endif // _XM_VMX128_INTRINSICS_
697}
698
699//------------------------------------------------------------------------------
700_Use_decl_annotations_
701inline XMVECTOR XMLoadFloat4
702(
703 const XMFLOAT4* pSource
704)
705{
706 assert(pSource);
707#if defined(_XM_NO_INTRINSICS_)
708 XMVECTOR V;
709 V.vector4_f32[0] = pSource->x;
710 V.vector4_f32[1] = pSource->y;
711 V.vector4_f32[2] = pSource->z;
712 V.vector4_f32[3] = pSource->w;
713 return V;
714#elif defined(_XM_ARM_NEON_INTRINSICS_)
715 return vld1q_f32( reinterpret_cast<const float*>(pSource) );
716#elif defined(_XM_SSE_INTRINSICS_)
717 return _mm_loadu_ps( &pSource->x );
718#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
719#endif // _XM_VMX128_INTRINSICS_
720}
721
722//------------------------------------------------------------------------------
723_Use_decl_annotations_
724inline XMVECTOR XMLoadFloat4A
725(
726 const XMFLOAT4A* pSource
727)
728{
729 assert(pSource);
730 assert(((uintptr_t)pSource & 0xF) == 0);
731#if defined(_XM_NO_INTRINSICS_)
732 XMVECTOR V;
733 V.vector4_f32[0] = pSource->x;
734 V.vector4_f32[1] = pSource->y;
735 V.vector4_f32[2] = pSource->z;
736 V.vector4_f32[3] = pSource->w;
737 return V;
738#elif defined(_XM_ARM_NEON_INTRINSICS_)
739 return vld1q_f32_ex( reinterpret_cast<const float*>(pSource), 128 );
740#elif defined(_XM_SSE_INTRINSICS_)
741 return _mm_load_ps( &pSource->x );
742#else // _XM_VMX128_INTRINSICS_
743#endif // _XM_VMX128_INTRINSICS_
744}
745
746//------------------------------------------------------------------------------
747_Use_decl_annotations_
748inline XMVECTOR XMLoadSInt4
749(
750 const XMINT4* pSource
751)
752{
753 assert(pSource);
754#if defined(_XM_NO_INTRINSICS_)
755
756 XMVECTOR V;
757 V.vector4_f32[0] = (float)pSource->x;
758 V.vector4_f32[1] = (float)pSource->y;
759 V.vector4_f32[2] = (float)pSource->z;
760 V.vector4_f32[3] = (float)pSource->w;
761 return V;
762
763#elif defined(_XM_ARM_NEON_INTRINSICS_)
764 __n128 v = vld1q_s32( reinterpret_cast<const int32_t*>(pSource) );
765 return vcvtq_f32_s32( v );
766#elif defined(_XM_SSE_INTRINSICS_)
767 __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
768 return _mm_cvtepi32_ps(V);
769#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
770#endif // _XM_VMX128_INTRINSICS_
771}
772
773//------------------------------------------------------------------------------
774_Use_decl_annotations_
775inline XMVECTOR XMLoadUInt4
776(
777 const XMUINT4* pSource
778)
779{
780 assert(pSource);
781#if defined(_XM_NO_INTRINSICS_)
782 XMVECTOR V;
783 V.vector4_f32[0] = (float)pSource->x;
784 V.vector4_f32[1] = (float)pSource->y;
785 V.vector4_f32[2] = (float)pSource->z;
786 V.vector4_f32[3] = (float)pSource->w;
787 return V;
788#elif defined(_XM_ARM_NEON_INTRINSICS_)
789 __n128 v = vld1q_u32( reinterpret_cast<const uint32_t*>(pSource) );
790 return vcvtq_f32_u32( v );
791#elif defined(_XM_SSE_INTRINSICS_)
792 __m128i V = _mm_loadu_si128( reinterpret_cast<const __m128i*>(pSource) );
793 // For the values that are higher than 0x7FFFFFFF, a fixup is needed
794 // Determine which ones need the fix.
795 XMVECTOR vMask = _mm_and_ps(_mm_castsi128_ps(V),g_XMNegativeZero);
796 // Force all values positive
797 XMVECTOR vResult = _mm_xor_ps(_mm_castsi128_ps(V),vMask);
798 // Convert to floats
799 vResult = _mm_cvtepi32_ps(_mm_castps_si128(vResult));
800 // Convert 0x80000000 -> 0xFFFFFFFF
801 __m128i iMask = _mm_srai_epi32(_mm_castps_si128(vMask),31);
802 // For only the ones that are too big, add the fixup
803 vMask = _mm_and_ps(_mm_castsi128_ps(iMask),g_XMFixUnsigned);
804 vResult = _mm_add_ps(vResult,vMask);
805 return vResult;
806#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
807#endif // _XM_VMX128_INTRINSICS_
808}
809
810//------------------------------------------------------------------------------
811_Use_decl_annotations_
812inline XMMATRIX XMLoadFloat3x3
813(
814 const XMFLOAT3X3* pSource
815)
816{
817 assert(pSource);
818#if defined(_XM_NO_INTRINSICS_)
819
820 XMMATRIX M;
821 M.r[0].vector4_f32[0] = pSource->m[0][0];
822 M.r[0].vector4_f32[1] = pSource->m[0][1];
823 M.r[0].vector4_f32[2] = pSource->m[0][2];
824 M.r[0].vector4_f32[3] = 0.0f;
825
826 M.r[1].vector4_f32[0] = pSource->m[1][0];
827 M.r[1].vector4_f32[1] = pSource->m[1][1];
828 M.r[1].vector4_f32[2] = pSource->m[1][2];
829 M.r[1].vector4_f32[3] = 0.0f;
830
831 M.r[2].vector4_f32[0] = pSource->m[2][0];
832 M.r[2].vector4_f32[1] = pSource->m[2][1];
833 M.r[2].vector4_f32[2] = pSource->m[2][2];
834 M.r[2].vector4_f32[3] = 0.0f;
835 M.r[3].vector4_f32[0] = 0.0f;
836 M.r[3].vector4_f32[1] = 0.0f;
837 M.r[3].vector4_f32[2] = 0.0f;
838 M.r[3].vector4_f32[3] = 1.0f;
839 return M;
840
841#elif defined(_XM_ARM_NEON_INTRINSICS_)
842 __n128 v0 = vld1q_f32( &pSource->m[0][0] );
843 __n128 v1 = vld1q_f32( &pSource->m[1][1] );
844 __n64 v2 = vcreate_f32( (uint64_t)*(const uint32_t*)&pSource->m[2][2] );
845 __n128 T = vextq_f32( v0, v1, 3 );
846
847 XMMATRIX M;
848 M.r[0] = vandq_u32( v0, g_XMMask3 );
849 M.r[1] = vandq_u32( T, g_XMMask3 );
850 M.r[2] = vcombine_f32( vget_high_f32(v1), v2 );
851 M.r[3] = g_XMIdentityR3;
852 return M;
853#elif defined(_XM_SSE_INTRINSICS_)
854 __m128 Z = _mm_setzero_ps();
855
856 __m128 V1 = _mm_loadu_ps( &pSource->m[0][0] );
857 __m128 V2 = _mm_loadu_ps( &pSource->m[1][1] );
858 __m128 V3 = _mm_load_ss( &pSource->m[2][2] );
859
860 __m128 T1 = _mm_unpackhi_ps( V1, Z );
861 __m128 T2 = _mm_unpacklo_ps( V2, Z );
862 __m128 T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
863 __m128 T4 = _mm_movehl_ps( T2, T3 );
864 __m128 T5 = _mm_movehl_ps( Z, T1 );
865
866 XMMATRIX M;
867 M.r[0] = _mm_movelh_ps( V1, T1 );
868 M.r[1] = _mm_add_ps( T4, T5 );
869 M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
870 M.r[3] = g_XMIdentityR3;
871 return M;
872#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
873#endif // _XM_VMX128_INTRINSICS_
874}
875
876//------------------------------------------------------------------------------
877_Use_decl_annotations_
878inline XMMATRIX XMLoadFloat4x3
879(
880 const XMFLOAT4X3* pSource
881)
882{
883 assert(pSource);
884#if defined(_XM_NO_INTRINSICS_)
885
886 XMMATRIX M;
887 M.r[0].vector4_f32[0] = pSource->m[0][0];
888 M.r[0].vector4_f32[1] = pSource->m[0][1];
889 M.r[0].vector4_f32[2] = pSource->m[0][2];
890 M.r[0].vector4_f32[3] = 0.0f;
891
892 M.r[1].vector4_f32[0] = pSource->m[1][0];
893 M.r[1].vector4_f32[1] = pSource->m[1][1];
894 M.r[1].vector4_f32[2] = pSource->m[1][2];
895 M.r[1].vector4_f32[3] = 0.0f;
896
897 M.r[2].vector4_f32[0] = pSource->m[2][0];
898 M.r[2].vector4_f32[1] = pSource->m[2][1];
899 M.r[2].vector4_f32[2] = pSource->m[2][2];
900 M.r[2].vector4_f32[3] = 0.0f;
901
902 M.r[3].vector4_f32[0] = pSource->m[3][0];
903 M.r[3].vector4_f32[1] = pSource->m[3][1];
904 M.r[3].vector4_f32[2] = pSource->m[3][2];
905 M.r[3].vector4_f32[3] = 1.0f;
906 return M;
907
908#elif defined(_XM_ARM_NEON_INTRINSICS_)
909 __n128 v0 = vld1q_f32( &pSource->m[0][0] );
910 __n128 v1 = vld1q_f32( &pSource->m[1][1] );
911 __n128 v2 = vld1q_f32( &pSource->m[2][2] );
912
913 __n128 T1 = vextq_f32( v0, v1, 3 );
914 __n128 T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
915 __n128 T3 = vextq_f32( v2, v2, 1 );
916
917 XMMATRIX M;
918 M.r[0] = vandq_u32( v0, g_XMMask3 );
919 M.r[1] = vandq_u32( T1, g_XMMask3 );
920 M.r[2] = vandq_u32( T2, g_XMMask3 );
921 M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
922 return M;
923#elif defined(_XM_SSE_INTRINSICS_)
924 // Use unaligned load instructions to
925 // load the 12 floats
926 // vTemp1 = x1,y1,z1,x2
927 XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
928 // vTemp2 = y2,z2,x3,y3
929 XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
930 // vTemp4 = z3,x4,y4,z4
931 XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
932 // vTemp3 = x3,y3,z3,z3
933 XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
934 // vTemp2 = y2,z2,x2,x2
935 vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
936 // vTemp2 = x2,y2,z2,z2
937 vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
938 // vTemp1 = x1,y1,z1,0
939 vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
940 // vTemp2 = x2,y2,z2,0
941 vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
942 // vTemp3 = x3,y3,z3,0
943 vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
944 // vTemp4i = x4,y4,z4,0
945 __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
946 // vTemp4i = x4,y4,z4,1.0f
947 vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
948 XMMATRIX M(vTemp1,
949 vTemp2,
950 vTemp3,
951 _mm_castsi128_ps(vTemp4i));
952 return M;
953#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
954#endif // _XM_VMX128_INTRINSICS_
955}
956
957//------------------------------------------------------------------------------
// Loads a 16-byte-aligned XMFLOAT4X3A (12 packed floats, row-major) into an
// XMMATRIX, promoting it to 4x4 with an implicit fourth column of (0,0,0,1).
_Use_decl_annotations_
inline XMMATRIX XMLoadFloat4x3A
(
    const XMFLOAT4X3A* pSource
)
{
    assert(pSource);
    // Aligned variant: the source must sit on a 16-byte boundary.
    assert(((uintptr_t)pSource & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

    // Scalar path: copy the three stored columns of each row and synthesize w.
    XMMATRIX M;
    M.r[0].vector4_f32[0] = pSource->m[0][0];
    M.r[0].vector4_f32[1] = pSource->m[0][1];
    M.r[0].vector4_f32[2] = pSource->m[0][2];
    M.r[0].vector4_f32[3] = 0.0f;

    M.r[1].vector4_f32[0] = pSource->m[1][0];
    M.r[1].vector4_f32[1] = pSource->m[1][1];
    M.r[1].vector4_f32[2] = pSource->m[1][2];
    M.r[1].vector4_f32[3] = 0.0f;

    M.r[2].vector4_f32[0] = pSource->m[2][0];
    M.r[2].vector4_f32[1] = pSource->m[2][1];
    M.r[2].vector4_f32[2] = pSource->m[2][2];
    M.r[2].vector4_f32[3] = 0.0f;

    M.r[3].vector4_f32[0] = pSource->m[3][0];
    M.r[3].vector4_f32[1] = pSource->m[3][1];
    M.r[3].vector4_f32[2] = pSource->m[3][2];
    M.r[3].vector4_f32[3] = 1.0f;
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Three aligned (128-bit-hinted) loads cover the 12 floats; vext/vcombine
    // realign rows, then mask w to 0 (rows 0-2) or set it to 1 (row 3).
    __n128 v0 = vld1q_f32_ex( &pSource->m[0][0], 128 );
    __n128 v1 = vld1q_f32_ex( &pSource->m[1][1], 128 );
    __n128 v2 = vld1q_f32_ex( &pSource->m[2][2], 128 );

    __n128 T1 = vextq_f32( v0, v1, 3 );
    __n128 T2 = vcombine_f32( vget_high_f32(v1), vget_low_f32(v2) );
    __n128 T3 = vextq_f32( v2, v2, 1 );

    XMMATRIX M;
    M.r[0] = vandq_u32( v0, g_XMMask3 );
    M.r[1] = vandq_u32( T1, g_XMMask3 );
    M.r[2] = vandq_u32( T2, g_XMMask3 );
    M.r[3] = vsetq_lane_f32( 1.f, T3, 3 );
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    // Use aligned load instructions to
    // load the 12 floats
    // vTemp1 = x1,y1,z1,x2
    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
    // vTemp2 = y2,z2,x3,y3
    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
    // vTemp4 = z3,x4,y4,z4
    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
    // vTemp3 = x3,y3,z3,z3
    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
    // vTemp2 = y2,z2,x2,x2
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
    // vTemp2 = x2,y2,z2,z2
    vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(1,1,0,2));
    // vTemp1 = x1,y1,z1,0
    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
    // vTemp2 = x2,y2,z2,0
    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
    // vTemp3 = x3,y3,z3,0
    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
    // vTemp4i = x4,y4,z4,0
    __m128i vTemp4i = _mm_srli_si128(_mm_castps_si128(vTemp4),32/8);
    // vTemp4i = x4,y4,z4,1.0f
    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
    XMMATRIX M(vTemp1,
            vTemp2,
            vTemp3,
            _mm_castsi128_ps(vTemp4i));
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1038
1039//------------------------------------------------------------------------------
1040_Use_decl_annotations_
1041inline XMMATRIX XMLoadFloat4x4
1042(
1043 const XMFLOAT4X4* pSource
1044)
1045{
1046 assert(pSource);
1047#if defined(_XM_NO_INTRINSICS_)
1048
1049 XMMATRIX M;
1050 M.r[0].vector4_f32[0] = pSource->m[0][0];
1051 M.r[0].vector4_f32[1] = pSource->m[0][1];
1052 M.r[0].vector4_f32[2] = pSource->m[0][2];
1053 M.r[0].vector4_f32[3] = pSource->m[0][3];
1054
1055 M.r[1].vector4_f32[0] = pSource->m[1][0];
1056 M.r[1].vector4_f32[1] = pSource->m[1][1];
1057 M.r[1].vector4_f32[2] = pSource->m[1][2];
1058 M.r[1].vector4_f32[3] = pSource->m[1][3];
1059
1060 M.r[2].vector4_f32[0] = pSource->m[2][0];
1061 M.r[2].vector4_f32[1] = pSource->m[2][1];
1062 M.r[2].vector4_f32[2] = pSource->m[2][2];
1063 M.r[2].vector4_f32[3] = pSource->m[2][3];
1064
1065 M.r[3].vector4_f32[0] = pSource->m[3][0];
1066 M.r[3].vector4_f32[1] = pSource->m[3][1];
1067 M.r[3].vector4_f32[2] = pSource->m[3][2];
1068 M.r[3].vector4_f32[3] = pSource->m[3][3];
1069 return M;
1070
1071#elif defined(_XM_ARM_NEON_INTRINSICS_)
1072 XMMATRIX M;
1073 M.r[0] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_11) );
1074 M.r[1] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_21) );
1075 M.r[2] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_31) );
1076 M.r[3] = vld1q_f32( reinterpret_cast<const float*>(&pSource->_41) );
1077 return M;
1078#elif defined(_XM_SSE_INTRINSICS_)
1079 XMMATRIX M;
1080 M.r[0] = _mm_loadu_ps( &pSource->_11 );
1081 M.r[1] = _mm_loadu_ps( &pSource->_21 );
1082 M.r[2] = _mm_loadu_ps( &pSource->_31 );
1083 M.r[3] = _mm_loadu_ps( &pSource->_41 );
1084 return M;
1085#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1086#endif // _XM_VMX128_INTRINSICS_
1087}
1088
1089//------------------------------------------------------------------------------
// Loads a 16-byte-aligned XMFLOAT4X4A (16 floats, row-major) into an
// XMMATRIX; all 16 elements are copied verbatim.
_Use_decl_annotations_
inline XMMATRIX XMLoadFloat4x4A
(
    const XMFLOAT4X4A* pSource
)
{
    assert(pSource);
    // Aligned variant: the source must sit on a 16-byte boundary.
    assert(((uintptr_t)pSource & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

    // Scalar path: straight element-by-element copy.
    XMMATRIX M;
    M.r[0].vector4_f32[0] = pSource->m[0][0];
    M.r[0].vector4_f32[1] = pSource->m[0][1];
    M.r[0].vector4_f32[2] = pSource->m[0][2];
    M.r[0].vector4_f32[3] = pSource->m[0][3];

    M.r[1].vector4_f32[0] = pSource->m[1][0];
    M.r[1].vector4_f32[1] = pSource->m[1][1];
    M.r[1].vector4_f32[2] = pSource->m[1][2];
    M.r[1].vector4_f32[3] = pSource->m[1][3];

    M.r[2].vector4_f32[0] = pSource->m[2][0];
    M.r[2].vector4_f32[1] = pSource->m[2][1];
    M.r[2].vector4_f32[2] = pSource->m[2][2];
    M.r[2].vector4_f32[3] = pSource->m[2][3];

    M.r[3].vector4_f32[0] = pSource->m[3][0];
    M.r[3].vector4_f32[1] = pSource->m[3][1];
    M.r[3].vector4_f32[2] = pSource->m[3][2];
    M.r[3].vector4_f32[3] = pSource->m[3][3];
    return M;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // One aligned (128-bit-hinted) 4-float load per row.
    XMMATRIX M;
    M.r[0] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_11), 128 );
    M.r[1] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_21), 128 );
    M.r[2] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_31), 128 );
    M.r[3] = vld1q_f32_ex( reinterpret_cast<const float*>(&pSource->_41), 128 );
    return M;
#elif defined(_XM_SSE_INTRINSICS_)
    // One aligned 4-float load per row.
    XMMATRIX M;
    M.r[0] = _mm_load_ps( &pSource->_11 );
    M.r[1] = _mm_load_ps( &pSource->_21 );
    M.r[2] = _mm_load_ps( &pSource->_31 );
    M.r[3] = _mm_load_ps( &pSource->_41 );
    return M;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1139
1140/****************************************************************************
1141 *
1142 * Vector and matrix store operations
1143 *
1144 ****************************************************************************/
// Stores the x lane of V as a single 32-bit integer at pDestination
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreInt
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    *pDestination = XMVectorGetIntX( V );
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store lane 0 only.
    vst1q_lane_u32( pDestination, V, 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Bitwise store of the low 32 bits (the float cast does not convert).
    _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1162
1163//------------------------------------------------------------------------------
// Stores the x lane of V as a single float at pDestination
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreFloat
(
    float* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    *pDestination = XMVectorGetX( V );
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store lane 0 only.
    vst1q_lane_f32( pDestination, V, 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_store_ss( pDestination, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1181
1182//------------------------------------------------------------------------------
// Stores the x and y lanes of V as two 32-bit integers at pDestination
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreInt2
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store the low 64 bits (lanes 0 and 1).
    __n64 VL = vget_low_u32(V);
    vst1_u32( pDestination, VL );
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1, then emit two scalar stores (bitwise, no conversion).
    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination[0]), V );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1204
1205//------------------------------------------------------------------------------
// Stores the x and y lanes of V as two 32-bit integers at a 16-byte-aligned
// destination.
_Use_decl_annotations_
inline void XMStoreInt2A
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store the low 64 bits with a 64-bit alignment hint.
    __n64 VL = vget_low_u32(V);
    vst1_u32_ex( pDestination, VL, 64 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Single 64-bit store of the two low lanes.
    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1226
1227//------------------------------------------------------------------------------
// Stores the x and y lanes of V into an XMFLOAT2 (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreFloat2
(
    XMFLOAT2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store the low 64 bits (lanes 0 and 1).
    __n64 VL = vget_low_f32(V);
    vst1_f32( reinterpret_cast<float*>(pDestination), VL );
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1, then emit two scalar stores.
    XMVECTOR T = XM_PERMUTE_PS( V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( &pDestination->x, V );
    _mm_store_ss( &pDestination->y, T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1249
1250//------------------------------------------------------------------------------
// Stores the x and y lanes of V into a 16-byte-aligned XMFLOAT2A.
_Use_decl_annotations_
inline void XMStoreFloat2A
(
    XMFLOAT2A* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store the low 64 bits with a 64-bit alignment hint.
    __n64 VL = vget_low_f32(V);
    vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Single 64-bit store of the two low lanes.
    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1271
1272//------------------------------------------------------------------------------
// Converts the x and y lanes of V from float to signed int (truncation) and
// stores them into an XMINT2. On the SSE path, values above INT_MAX saturate
// to 0x7FFFFFFF.
_Use_decl_annotations_
inline void XMStoreSInt2
(
    XMINT2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (int32_t)V.vector4_f32[0];
    pDestination->y = (int32_t)V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Convert the low two lanes and store them as a 64-bit pair.
    __n64 v = vget_low_s32(V);
    v = vcvt_s32_f32( v );
    vst1_s32( reinterpret_cast<int32_t*>(pDestination), v );
#elif defined(_XM_SSE_INTRINSICS_)
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(V);
    // If there was positive overflow, set to 0x7FFFFFFF
    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
    vOverflow = _mm_or_ps(vOverflow,vResult);
    // Write two ints
    XMVECTOR T = XM_PERMUTE_PS( vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1304
1305//------------------------------------------------------------------------------
// Converts the x and y lanes of V from float to unsigned int and stores them
// into an XMUINT2. On the SSE path, negative inputs clamp to 0 and values
// above UINT_MAX saturate to 0xFFFFFFFF.
_Use_decl_annotations_
inline void XMStoreUInt2
(
    XMUINT2* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (uint32_t)V.vector4_f32[0];
    pDestination->y = (uint32_t)V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Convert the low two lanes and store them as a 64-bit pair.
    __n64 v = vget_low_u32(V);
    v = vcvt_u32_f32( v );
    vst1_u32( reinterpret_cast<uint32_t*>(pDestination), v );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to >=0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    // Write two uints
    XMVECTOR T = XM_PERMUTE_PS( vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1346
1347//------------------------------------------------------------------------------
// Stores the x, y and z lanes of V as three 32-bit integers at pDestination
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreInt3
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 64-bit store of lanes 0-1, then a lane store for lane 2.
    __n64 VL = vget_low_u32(V);
    vst1_u32( pDestination, VL );
    vst1q_lane_u32( pDestination+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lanes 1 and 2, then emit three scalar stores (bitwise).
    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( reinterpret_cast<float*>(pDestination), V );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination[1]), T1 );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1373
1374//------------------------------------------------------------------------------
// Stores the x, y and z lanes of V as three 32-bit integers at a
// 16-byte-aligned destination.
_Use_decl_annotations_
inline void XMStoreInt3A
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Aligned 64-bit store of lanes 0-1, then a lane store for lane 2.
    __n64 VL = vget_low_u32(V);
    vst1_u32_ex( pDestination, VL, 64 );
    vst1q_lane_u32( pDestination+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // 64-bit store of lanes 0-1, then a scalar store of lane 2.
    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination[2]), T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1399
1400//------------------------------------------------------------------------------
// Stores the x, y and z lanes of V into an XMFLOAT3
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreFloat3
(
    XMFLOAT3* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 64-bit store of lanes 0-1, then a lane store for lane 2.
    __n64 VL = vget_low_f32(V);
    vst1_f32( reinterpret_cast<float*>(pDestination), VL );
    vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lanes 1 and 2, then emit three scalar stores.
    XMVECTOR T1 = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( &pDestination->x, V );
    _mm_store_ss( &pDestination->y, T1 );
    _mm_store_ss( &pDestination->z, T2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1426
1427//------------------------------------------------------------------------------
// Stores the x, y and z lanes of V into a 16-byte-aligned XMFLOAT3A.
_Use_decl_annotations_
inline void XMStoreFloat3A
(
    XMFLOAT3A* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Aligned 64-bit store of lanes 0-1, then a lane store for lane 2.
    __n64 VL = vget_low_f32(V);
    vst1_f32_ex( reinterpret_cast<float*>(pDestination), VL, 64 );
    vst1q_lane_f32( reinterpret_cast<float*>(pDestination)+2, V, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // 64-bit store of lanes 0-1, then a scalar store of lane 2.
    XMVECTOR T = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_storel_epi64( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
    _mm_store_ss( &pDestination->z, T );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1452
1453//------------------------------------------------------------------------------
// Converts the x, y and z lanes of V from float to signed int (truncation)
// and stores them into an XMINT3. On the SSE path, values above INT_MAX
// saturate to 0x7FFFFFFF.
_Use_decl_annotations_
inline void XMStoreSInt3
(
    XMINT3* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (int32_t)V.vector4_f32[0];
    pDestination->y = (int32_t)V.vector4_f32[1];
    pDestination->z = (int32_t)V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Convert all lanes, store lanes 0-1 as a pair then lane 2 alone.
    __n128 v = vcvtq_s32_f32(V);
    __n64 vL = vget_low_s32(v);
    vst1_s32( reinterpret_cast<int32_t*>(pDestination), vL );
    vst1q_lane_s32( reinterpret_cast<int32_t*>(pDestination)+2, v, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(V);
    // If there was positive overflow, set to 0x7FFFFFFF
    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
    vOverflow = _mm_or_ps(vOverflow,vResult);
    // Write 3 ints
    XMVECTOR T1 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = XM_PERMUTE_PS(vOverflow,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vOverflow );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1489
1490//------------------------------------------------------------------------------
// Converts the x, y and z lanes of V from float to unsigned int and stores
// them into an XMUINT3. On the SSE path, negative inputs clamp to 0 and
// values above UINT_MAX saturate to 0xFFFFFFFF.
_Use_decl_annotations_
inline void XMStoreUInt3
(
    XMUINT3* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (uint32_t)V.vector4_f32[0];
    pDestination->y = (uint32_t)V.vector4_f32[1];
    pDestination->z = (uint32_t)V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Convert all lanes, store lanes 0-1 as a pair then lane 2 alone.
    __n128 v = vcvtq_u32_f32(V);
    __n64 vL = vget_low_u32(v);
    vst1_u32( reinterpret_cast<uint32_t*>(pDestination), vL );
    vst1q_lane_u32( reinterpret_cast<uint32_t*>(pDestination)+2, v, 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to >=0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    // Write 3 uints
    XMVECTOR T1 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR T2 = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->x), vResult );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->y), T1 );
    _mm_store_ss( reinterpret_cast<float*>(&pDestination->z), T2 );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1535
1536//------------------------------------------------------------------------------
// Stores all four lanes of V as 32-bit integers at pDestination
// (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreInt4
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
    pDestination[3] = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_u32( pDestination, V );
#elif defined(_XM_SSE_INTRINSICS_)
    // Unaligned 128-bit store (bitwise, no conversion).
    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1557
1558//------------------------------------------------------------------------------
// Stores all four lanes of V as 32-bit integers at a 16-byte-aligned
// destination.
_Use_decl_annotations_
inline void XMStoreInt4A
(
    uint32_t* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination[0] = V.vector4_u32[0];
    pDestination[1] = V.vector4_u32[1];
    pDestination[2] = V.vector4_u32[2];
    pDestination[3] = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 128-bit store with alignment hint.
    vst1q_u32_ex( pDestination, V, 128 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Aligned 128-bit store (bitwise, no conversion).
    _mm_store_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(V) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1580
1581
1582//------------------------------------------------------------------------------
// Stores all four lanes of V into an XMFLOAT4 (no alignment requirement).
_Use_decl_annotations_
inline void XMStoreFloat4
(
    XMFLOAT4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
    pDestination->w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_f32( reinterpret_cast<float*>(pDestination), V );
#elif defined(_XM_SSE_INTRINSICS_)
    // Unaligned 128-bit store.
    _mm_storeu_ps( &pDestination->x, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1603
1604//------------------------------------------------------------------------------
// Stores all four lanes of V into a 16-byte-aligned XMFLOAT4A.
_Use_decl_annotations_
inline void XMStoreFloat4A
(
    XMFLOAT4A* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
    // Aligned variant: destination must sit on a 16-byte boundary.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = V.vector4_f32[0];
    pDestination->y = V.vector4_f32[1];
    pDestination->z = V.vector4_f32[2];
    pDestination->w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 128-bit store with alignment hint.
    vst1q_f32_ex( reinterpret_cast<float*>(pDestination), V, 128 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Aligned 128-bit store.
    _mm_store_ps( &pDestination->x, V );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1626
1627
1628//------------------------------------------------------------------------------
// Converts all four lanes of V from float to signed int (truncation) and
// stores them into an XMINT4. On the SSE path, values above INT_MAX saturate
// to 0x7FFFFFFF.
_Use_decl_annotations_
inline void XMStoreSInt4
(
    XMINT4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (int32_t)V.vector4_f32[0];
    pDestination->y = (int32_t)V.vector4_f32[1];
    pDestination->z = (int32_t)V.vector4_f32[2];
    pDestination->w = (int32_t)V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 v = vcvtq_s32_f32(V);
    vst1q_s32( reinterpret_cast<int32_t*>(pDestination), v );
#elif defined(_XM_SSE_INTRINSICS_)
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(V);
    // If there was positive overflow, set to 0x7FFFFFFF
    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,_mm_castsi128_ps(vResulti));
    vOverflow = _mm_or_ps(vOverflow,vResult);
    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vOverflow) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1658
1659//------------------------------------------------------------------------------
// Converts all four lanes of V from float to unsigned int and stores them
// into an XMUINT4. On the SSE path, negative inputs clamp to 0 and values
// above UINT_MAX saturate to 0xFFFFFFFF.
_Use_decl_annotations_
inline void XMStoreUInt4
(
    XMUINT4* pDestination,
    FXMVECTOR V
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)
    pDestination->x = (uint32_t)V.vector4_f32[0];
    pDestination->y = (uint32_t)V.vector4_f32[1];
    pDestination->z = (uint32_t)V.vector4_f32[2];
    pDestination->w = (uint32_t)V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 v = vcvtq_u32_f32(V);
    vst1q_u32( reinterpret_cast<uint32_t*>(pDestination), v );
#elif defined(_XM_SSE_INTRINSICS_)
    // Clamp to >=0
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
    XMVECTOR vValue = g_XMUnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(_mm_castsi128_ps(vResulti),vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    _mm_storeu_si128( reinterpret_cast<__m128i*>(pDestination), _mm_castps_si128(vResult) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1698
1699//------------------------------------------------------------------------------
// Stores the upper-left 3x3 of matrix M into an XMFLOAT3X3 (9 packed floats,
// row-major, no alignment requirement); the w lanes and fourth row of M are
// discarded.
_Use_decl_annotations_
inline void XMStoreFloat3x3
(
    XMFLOAT3X3* pDestination,
    CXMMATRIX M
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)

    // Scalar path: copy three floats from each of the first three rows.
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Pack the 9 floats into two full vectors plus one lane:
    // (x1,y1,z1,x2), (y2,z2,x3,y3), then z3 alone.
    __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 );
    __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
    vst1q_f32( &pDestination->m[0][0], T2 );

    T1 = vextq_f32( M.r[1], M.r[1], 1 );
    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
    vst1q_f32( &pDestination->m[1][1], T2 );

    vst1q_lane_f32( &pDestination->m[2][2], M.r[2], 2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Same packing with shuffles: two unaligned 16-byte stores plus one
    // scalar store for the final element.
    XMVECTOR vTemp1 = M.r[0];
    XMVECTOR vTemp2 = M.r[1];
    XMVECTOR vTemp3 = M.r[2];
    XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
    vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
    vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1746
1747
1748//------------------------------------------------------------------------------
// Stores the first three columns of each row of matrix M into an XMFLOAT4X3
// (12 packed floats, row-major, no alignment requirement); the w lanes of M
// are discarded.
_Use_decl_annotations_
inline void XMStoreFloat4x3
(
    XMFLOAT4X3* pDestination,
    CXMMATRIX M
)
{
    assert(pDestination);
#if defined(_XM_NO_INTRINSICS_)

    // Scalar path: copy three floats from each of the four rows.
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];

    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Pack the 12 floats into three full vectors:
    // (x1,y1,z1,x2), (y2,z2,x3,y3), (z3,x4,y4,z4).
    __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 );
    __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
    vst1q_f32( &pDestination->m[0][0], T2 );

    T1 = vextq_f32( M.r[1], M.r[1], 1 );
    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
    vst1q_f32( &pDestination->m[1][1], T2 );

    T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
    T2 = vextq_f32( T1, M.r[3], 3 );
    vst1q_f32( &pDestination->m[2][2], T2 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Same packing with shuffles: three unaligned 16-byte stores.
    XMVECTOR vTemp1 = M.r[0];
    XMVECTOR vTemp2 = M.r[1];
    XMVECTOR vTemp3 = M.r[2];
    XMVECTOR vTemp4 = M.r[3];
    // vTemp2x = y2,z2,x3,y3 (Final)
    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    // vTemp2 = x2,x2,z1,z1
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
    // vTemp1 = x1,y1,z1,x2 (Final)
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
    // vTemp3 = z3,z3,x4,x4
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
    // vTemp3 = z3,x4,y4,z4 (Final)
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
    _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1803
1804//------------------------------------------------------------------------------
_Use_decl_annotations_
// Stores the upper 4x3 portion of matrix M (the w element of each row is
// dropped) into a 16-byte-aligned XMFLOAT4X3A. The 12 floats are packed
// contiguously, so the SIMD paths can emit exactly three aligned 16-byte
// stores instead of twelve scalar writes.
inline void XMStoreFloat4x3A
(
    XMFLOAT4X3A* pDestination,
    CXMMATRIX M
)
{
    assert(pDestination);
    // The A-suffixed type guarantees 16-byte alignment; verify it so the
    // aligned stores below are legal.
    assert(((uintptr_t)pDestination & 0xF) == 0);
#if defined(_XM_NO_INTRINSICS_)

    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];

    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // y1,z1,w1,x2
    __n128 T1 = vextq_f32( M.r[0], M.r[1], 1 );
    // x1,y1,z1,x2 (Final) -- g_XMMask3 selects row 0's xyz, T1's last lane
    __n128 T2 = vbslq_f32( g_XMMask3, M.r[0], T1 );
    vst1q_f32_ex( &pDestination->m[0][0], T2, 128 );

    // y2,z2,w2,x2
    T1 = vextq_f32( M.r[1], M.r[1], 1 );
    // y2,z2,x3,y3 (Final)
    T2 = vcombine_f32( vget_low_f32(T1), vget_low_f32(M.r[2]) );
    vst1q_f32_ex( &pDestination->m[1][1], T2, 128 );

    // z3,z3,z3,z3
    T1 = vdupq_lane_f32( vget_high_f32( M.r[2] ), 0 );
    // z3,x4,y4,z4 (Final)
    T2 = vextq_f32( T1, M.r[3], 3 );
    vst1q_f32_ex( &pDestination->m[2][2], T2, 128 );
#elif defined(_XM_SSE_INTRINSICS_)
    // x1,y1,z1,w1
    XMVECTOR vTemp1 = M.r[0];
    // x2,y2,z2,w2
    XMVECTOR vTemp2 = M.r[1];
    // x3,y3,z3,w3
    XMVECTOR vTemp3 = M.r[2];
    // x4,y4,z4,w4
    XMVECTOR vTemp4 = M.r[3];
    // z1,z1,x2,y2
    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
    // y2,z2,x3,y3 (Final)
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    // x1,y1,z1,x2 (Final)
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
    // z3,z3,x4,x4
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
    // z3,x4,y4,z4 (Final)
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
    // Store in 3 operations; the stores at m[1][1] and m[2][2] are offsets
    // of 16 and 32 bytes from the aligned base, so they remain 16-byte aligned.
    _mm_store_ps(&pDestination->m[0][0],vTemp1);
    _mm_store_ps(&pDestination->m[1][1],vTemp2);
    _mm_store_ps(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1870
1871
1872//------------------------------------------------------------------------------
1873_Use_decl_annotations_
1874inline void XMStoreFloat4x4
1875(
1876 XMFLOAT4X4* pDestination,
1877 CXMMATRIX M
1878)
1879{
1880 assert(pDestination);
1881#if defined(_XM_NO_INTRINSICS_)
1882
1883 pDestination->m[0][0] = M.r[0].vector4_f32[0];
1884 pDestination->m[0][1] = M.r[0].vector4_f32[1];
1885 pDestination->m[0][2] = M.r[0].vector4_f32[2];
1886 pDestination->m[0][3] = M.r[0].vector4_f32[3];
1887
1888 pDestination->m[1][0] = M.r[1].vector4_f32[0];
1889 pDestination->m[1][1] = M.r[1].vector4_f32[1];
1890 pDestination->m[1][2] = M.r[1].vector4_f32[2];
1891 pDestination->m[1][3] = M.r[1].vector4_f32[3];
1892
1893 pDestination->m[2][0] = M.r[2].vector4_f32[0];
1894 pDestination->m[2][1] = M.r[2].vector4_f32[1];
1895 pDestination->m[2][2] = M.r[2].vector4_f32[2];
1896 pDestination->m[2][3] = M.r[2].vector4_f32[3];
1897
1898 pDestination->m[3][0] = M.r[3].vector4_f32[0];
1899 pDestination->m[3][1] = M.r[3].vector4_f32[1];
1900 pDestination->m[3][2] = M.r[3].vector4_f32[2];
1901 pDestination->m[3][3] = M.r[3].vector4_f32[3];
1902
1903#elif defined(_XM_ARM_NEON_INTRINSICS_)
1904 vst1q_f32( reinterpret_cast<float*>(&pDestination->_11), M.r[0] );
1905 vst1q_f32( reinterpret_cast<float*>(&pDestination->_21), M.r[1] );
1906 vst1q_f32( reinterpret_cast<float*>(&pDestination->_31), M.r[2] );
1907 vst1q_f32( reinterpret_cast<float*>(&pDestination->_41), M.r[3] );
1908#elif defined(_XM_SSE_INTRINSICS_)
1909 _mm_storeu_ps( &pDestination->_11, M.r[0] );
1910 _mm_storeu_ps( &pDestination->_21, M.r[1] );
1911 _mm_storeu_ps( &pDestination->_31, M.r[2] );
1912 _mm_storeu_ps( &pDestination->_41, M.r[3] );
1913#else // _XM_VMX128_INTRINSICS_
1914#endif // _XM_VMX128_INTRINSICS_
1915}
1916
1917//------------------------------------------------------------------------------
1918_Use_decl_annotations_
1919inline void XMStoreFloat4x4A
1920(
1921 XMFLOAT4X4A* pDestination,
1922 CXMMATRIX M
1923)
1924{
1925 assert(pDestination);
1926 assert(((uintptr_t)pDestination & 0xF) == 0);
1927#if defined(_XM_NO_INTRINSICS_)
1928
1929 pDestination->m[0][0] = M.r[0].vector4_f32[0];
1930 pDestination->m[0][1] = M.r[0].vector4_f32[1];
1931 pDestination->m[0][2] = M.r[0].vector4_f32[2];
1932 pDestination->m[0][3] = M.r[0].vector4_f32[3];
1933
1934 pDestination->m[1][0] = M.r[1].vector4_f32[0];
1935 pDestination->m[1][1] = M.r[1].vector4_f32[1];
1936 pDestination->m[1][2] = M.r[1].vector4_f32[2];
1937 pDestination->m[1][3] = M.r[1].vector4_f32[3];
1938
1939 pDestination->m[2][0] = M.r[2].vector4_f32[0];
1940 pDestination->m[2][1] = M.r[2].vector4_f32[1];
1941 pDestination->m[2][2] = M.r[2].vector4_f32[2];
1942 pDestination->m[2][3] = M.r[2].vector4_f32[3];
1943
1944 pDestination->m[3][0] = M.r[3].vector4_f32[0];
1945 pDestination->m[3][1] = M.r[3].vector4_f32[1];
1946 pDestination->m[3][2] = M.r[3].vector4_f32[2];
1947 pDestination->m[3][3] = M.r[3].vector4_f32[3];
1948
1949#elif defined(_XM_ARM_NEON_INTRINSICS_)
1950 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_11), M.r[0], 128 );
1951 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_21), M.r[1], 128 );
1952 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_31), M.r[2], 128 );
1953 vst1q_f32_ex( reinterpret_cast<float*>(&pDestination->_41), M.r[3], 128 );
1954#elif defined(_XM_SSE_INTRINSICS_)
1955 _mm_store_ps( &pDestination->_11, M.r[0] );
1956 _mm_store_ps( &pDestination->_21, M.r[1] );
1957 _mm_store_ps( &pDestination->_31, M.r[2] );
1958 _mm_store_ps( &pDestination->_41, M.r[3] );
1959#else // _XM_VMX128_INTRINSICS_
1960#endif // _XM_VMX128_INTRINSICS_
1961}
1962