The game where you go into mines and start crafting — but for consoles (forked directly from smartcmd's GitHub repository).
1//-------------------------------------------------------------------------------------
2// DirectXMathVector.inl -- SIMD C++ Math library
3//
4// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF
5// ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO
6// THE IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A
7// PARTICULAR PURPOSE.
8//
9// Copyright (c) Microsoft Corporation. All rights reserved.
10//-------------------------------------------------------------------------------------
11
#ifdef _MSC_VER
#pragma once
#endif

#if defined(_XM_NO_INTRINSICS_)
// Scalar helpers that classify an IEEE-754 single by its bit pattern:
// XMISNAN: exponent all ones and a nonzero mantissa; XMISINF: exponent all
// ones, mantissa zero (sign bit masked off).
// NOTE(review): both macros type-pun via a uint32_t* cast (a strict-aliasing
// violation in standard C++) and evaluate their argument more than once —
// presumably tolerated because this path targets MSVC only; confirm before
// building the no-intrinsics path elsewhere.
#define XMISNAN(x) ((*(uint32_t*)&(x) & 0x7F800000) == 0x7F800000 && (*(uint32_t*)&(x) & 0x7FFFFF) != 0)
#define XMISINF(x) ((*(uint32_t*)&(x) & 0x7FFFFFFF) == 0x7F800000)
#endif
20
21/****************************************************************************
22 *
23 * General Vector
24 *
25 ****************************************************************************/
26
27//------------------------------------------------------------------------------
28// Assignment operations
29//------------------------------------------------------------------------------
30
31//------------------------------------------------------------------------------
// Return a vector with all elements equaling zero
inline XMVECTOR XMVectorZero()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Integer dup of 0 produces the same all-zero bit pattern as 0.0f lanes.
    return vdupq_n_u32(0);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_setzero_ps();
#else // _XM_VMX128_INTRINSICS_
    // NOTE(review): the VMX128 branches are empty throughout this file --
    // presumably stripped in this fork; confirm against upstream DirectXMath.
#endif // _XM_VMX128_INTRINSICS_
}
45
46//------------------------------------------------------------------------------
// Initialize a vector with four floating point values
inline XMVECTOR XMVectorSet
(
    float x,
    float y,
    float z,
    float w
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTORF32 vResult = {x,y,z,w};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Pack the raw float bits into two 64-bit halves: (x,y) low, (z,w) high.
    // NOTE(review): the uint32_t* casts type-pun float bits; the MSVC-specific
    // __n64 type implies this path is MSVC-only, where this is tolerated.
    __n64 V0 = vcreate_f32(((uint64_t)*(const uint32_t *)&x) | ((uint64_t)(*(const uint32_t *)&y) << 32));
    __n64 V1 = vcreate_f32(((uint64_t)*(const uint32_t *)&z) | ((uint64_t)(*(const uint32_t *)&w) << 32));
    return vcombine_f32(V0, V1);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_set_ps orders arguments high-to-low, so (w,z,y,x) yields [x y z w].
    return _mm_set_ps( w, z, y, x );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
68
69//------------------------------------------------------------------------------
70// Initialize a vector with four integer values
71inline XMVECTOR XMVectorSetInt
72(
73 uint32_t x,
74 uint32_t y,
75 uint32_t z,
76 uint32_t w
77)
78{
79#if defined(_XM_NO_INTRINSICS_)
80 XMVECTORU32 vResult = {x,y,z,w};
81 return vResult.v;
82#elif defined(_XM_ARM_NEON_INTRINSICS_)
83 __n64 V0 = vcreate_u32(((uint64_t)x) | ((uint64_t)y << 32));
84 __n64 V1 = vcreate_u32(((uint64_t)z) | ((uint64_t)w << 32));
85 return vcombine_u32(V0, V1);
86#elif defined(_XM_SSE_INTRINSICS_)
87 __m128i V = _mm_set_epi32( w, z, y, x );
88 return reinterpret_cast<__m128 *>(&V)[0];
89#else // _XM_VMX128_INTRINSICS_
90#endif // _XM_VMX128_INTRINSICS_
91}
92
93//------------------------------------------------------------------------------
// Initialize a vector with a replicated floating point value
inline XMVECTOR XMVectorReplicate
(
    float Value
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    // Scalar path also serves builds that must avoid misaligned vector access.
    XMVECTORF32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_f32( Value );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_set_ps1( Value );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
110
111//------------------------------------------------------------------------------
// Initialize a vector with a replicated floating point value passed by pointer
// pValue must be non-null; the pointed-to float is broadcast to all four lanes.
_Use_decl_annotations_
inline XMVECTOR XMVectorReplicatePtr
(
    const float *pValue
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    float Value = pValue[0];
    XMVECTORF32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Load-and-duplicate straight from memory.
    return vld1q_dup_f32( pValue );
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_load_ps1( pValue );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
130
131//------------------------------------------------------------------------------
// Initialize a vector with a replicated integer value
inline XMVECTOR XMVectorReplicateInt
(
    uint32_t Value
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    XMVECTORU32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32( Value );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_set1_epi32( Value );
    // _mm_castsi128_ps is a pure bit-cast: no value conversion, no code emitted.
    return _mm_castsi128_ps(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
149
150//------------------------------------------------------------------------------
// Initialize a vector with a replicated integer value passed by pointer
// pValue must be non-null; the 32-bit pattern is broadcast to all four lanes.
_Use_decl_annotations_
inline XMVECTOR XMVectorReplicateIntPtr
(
    const uint32_t *pValue
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
    uint32_t Value = pValue[0];
    XMVECTORU32 vResult = {Value,Value,Value,Value};
    return vResult.v;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_dup_u32(pValue);
#elif defined(_XM_SSE_INTRINSICS_)
    // Loads the integer bits through a float load; lanes carry the raw pattern.
    // NOTE(review): reinterpret_cast to float* is a type-pun -- tolerated on
    // MSVC, which this SSE path targets.
    return _mm_load_ps1(reinterpret_cast<const float *>(pValue));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
169
170//------------------------------------------------------------------------------
171// Initialize a vector with all bits set (true mask)
172inline XMVECTOR XMVectorTrueInt()
173{
174#if defined(_XM_NO_INTRINSICS_)
175 XMVECTORU32 vResult = {0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU,0xFFFFFFFFU};
176 return vResult.v;
177#elif defined(_XM_ARM_NEON_INTRINSICS_)
178 return vdupq_n_s32(-1);
179#elif defined(_XM_SSE_INTRINSICS_)
180 __m128i V = _mm_set1_epi32(-1);
181 return reinterpret_cast<__m128 *>(&V)[0];
182#else // _XM_VMX128_INTRINSICS_
183#endif // _XM_VMX128_INTRINSICS_
184}
185
186//------------------------------------------------------------------------------
// Initialize a vector with all bits clear (false mask)
// Identical bit pattern to XMVectorZero; kept separate for API symmetry with
// XMVectorTrueInt.
inline XMVECTOR XMVectorFalseInt()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {0.0f,0.0f,0.0f,0.0f};
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0);
#elif defined(_XM_SSE_INTRINSICS_)
    return _mm_setzero_ps();
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
200
201//------------------------------------------------------------------------------
// Replicate the x component of the vector
inline XMVECTOR XMVectorSplatX
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[0];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // x lives in lane 0 of the low 64-bit half.
    return vdupq_lane_f32( vget_low_f32( V ), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // _MM_SHUFFLE(0,0,0,0) broadcasts lane 0 into all four lanes.
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(0, 0, 0, 0) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
222
223//------------------------------------------------------------------------------
// Replicate the y component of the vector
inline XMVECTOR XMVectorSplatY
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[1];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // y lives in lane 1 of the low 64-bit half.
    return vdupq_lane_f32( vget_low_f32( V ), 1 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 into all four lanes.
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(1, 1, 1, 1) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
244
245//------------------------------------------------------------------------------
// Replicate the z component of the vector
inline XMVECTOR XMVectorSplatZ
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[2];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // z lives in lane 0 of the high 64-bit half.
    return vdupq_lane_f32( vget_high_f32( V ), 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 2 into all four lanes.
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(2, 2, 2, 2) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
266
267//------------------------------------------------------------------------------
// Replicate the w component of the vector
inline XMVECTOR XMVectorSplatW
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = V.vector4_f32[3];
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // w lives in lane 1 of the high 64-bit half.
    return vdupq_lane_f32( vget_high_f32( V ), 1 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 3 into all four lanes.
    return XM_PERMUTE_PS( V, _MM_SHUFFLE(3, 3, 3, 3) );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
288
289//------------------------------------------------------------------------------
// Return a vector of 1.0f,1.0f,1.0f,1.0f
inline XMVECTOR XMVectorSplatOne()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    vResult.vector4_f32[0] =
    vResult.vector4_f32[1] =
    vResult.vector4_f32[2] =
    vResult.vector4_f32[3] = 1.0f;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_f32(1.0f);
#elif defined(_XM_SSE_INTRINSICS_)
    // g_XMOne is a library-global constant vector of all 1.0f.
    return g_XMOne;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
307
308//------------------------------------------------------------------------------
// Return a vector of INF,INF,INF,INF
inline XMVECTOR XMVectorSplatInfinity()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    // 0x7F800000 is the IEEE-754 single-precision bit pattern of +infinity.
    vResult.vector4_u32[0] =
    vResult.vector4_u32[1] =
    vResult.vector4_u32[2] =
    vResult.vector4_u32[3] = 0x7F800000;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0x7F800000);
#elif defined(_XM_SSE_INTRINSICS_)
    return g_XMInfinity;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
326
327//------------------------------------------------------------------------------
// Return a vector of Q_NAN,Q_NAN,Q_NAN,Q_NAN
inline XMVECTOR XMVectorSplatQNaN()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    // 0x7FC00000 is the canonical quiet-NaN bit pattern (exponent all ones,
    // top mantissa bit set).
    vResult.vector4_u32[0] =
    vResult.vector4_u32[1] =
    vResult.vector4_u32[2] =
    vResult.vector4_u32[3] = 0x7FC00000;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0x7FC00000);
#elif defined(_XM_SSE_INTRINSICS_)
    return g_XMQNaN;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
345
346//------------------------------------------------------------------------------
// Return a vector of 1.192092896e-7f,1.192092896e-7f,1.192092896e-7f,1.192092896e-7f
inline XMVECTOR XMVectorSplatEpsilon()
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult;
    // 0x34000000 is the bit pattern of 1.192092896e-7f (FLT_EPSILON).
    vResult.vector4_u32[0] =
    vResult.vector4_u32[1] =
    vResult.vector4_u32[2] =
    vResult.vector4_u32[3] = 0x34000000;
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vdupq_n_u32(0x34000000);
#elif defined(_XM_SSE_INTRINSICS_)
    return g_XMEpsilon;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
364
365//------------------------------------------------------------------------------
366// Return a vector of -0.0f (0x80000000),-0.0f,-0.0f,-0.0f
367inline XMVECTOR XMVectorSplatSignMask()
368{
369#if defined(_XM_NO_INTRINSICS_)
370 XMVECTOR vResult;
371 vResult.vector4_u32[0] =
372 vResult.vector4_u32[1] =
373 vResult.vector4_u32[2] =
374 vResult.vector4_u32[3] = 0x80000000U;
375 return vResult;
376#elif defined(_XM_ARM_NEON_INTRINSICS_)
377 return vdupq_n_u32(0x80000000U);
378#elif defined(_XM_SSE_INTRINSICS_)
379 __m128i V = _mm_set1_epi32( 0x80000000 );
380 return reinterpret_cast<__m128*>(&V)[0];
381#else // _XM_VMX128_INTRINSICS_
382#endif // _XM_VMX128_INTRINSICS_
383}
384
385//------------------------------------------------------------------------------
// Return a floating point value via an index. This is not a recommended
// function to use due to performance loss.
// i must be 0-3 (asserted); lane is read directly from the union/register type.
inline float XMVectorGetByIndex(FXMVECTOR V, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return V.n128_f32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    // MSVC's __m128 exposes m128_f32[]; forces the vector through memory.
    return V.m128_f32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
401
402//------------------------------------------------------------------------------
// Return the X component in an FPU register.
inline float XMVectorGetX(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // _mm_cvtss_f32 extracts lane 0 directly.
    return _mm_cvtss_f32(V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
415
// Return the Y component in an FPU register.
inline float XMVectorGetY(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 to lane 0, then extract lane 0.
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
429
// Return the Z component in an FPU register.
inline float XMVectorGetZ(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 2 to lane 0, then extract lane 0.
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
443
// Return the W component in an FPU register.
inline float XMVectorGetW(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_f32(V, 3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 3 to lane 0, then extract lane 0.
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    return _mm_cvtss_f32(vTemp);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
457
458//------------------------------------------------------------------------------
459
// Store a component indexed by i into a 32 bit float location in memory.
// f must be non-null and i in 0-3 (both asserted).
_Use_decl_annotations_
inline void XMVectorGetByIndexPtr(float *f, FXMVECTOR V, size_t i)
{
    assert( f != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    *f = V.vector4_f32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    *f = V.n128_f32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    *f = V.m128_f32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
476
477//------------------------------------------------------------------------------
478
// Store the X component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetXPtr(float *x, FXMVECTOR V)
{
    assert( x != NULL);
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_f32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Store lane 0 straight to memory.
    vst1q_lane_f32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    _mm_store_ss(x,V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
493
// Store the Y component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetYPtr(float *y, FXMVECTOR V)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *y = V.vector4_f32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 to lane 0, then store lane 0.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    _mm_store_ss(y,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
509
// Store the Z component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetZPtr(float *z, FXMVECTOR V)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *z = V.vector4_f32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 2 to lane 0, then store lane 0.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(z,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
525
// Store the W component into a 32 bit float location in memory.
_Use_decl_annotations_
inline void XMVectorGetWPtr(float *w, FXMVECTOR V)
{
    assert( w != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *w = V.vector4_f32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_f32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 3 to lane 0, then store lane 0.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(w,vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
541
542//------------------------------------------------------------------------------
543
// Return an integer value via an index. This is not a recommended
// function to use due to performance loss.
// i must be 0-3 (asserted); returns the raw 32-bit lane contents.
inline uint32_t XMVectorGetIntByIndex(FXMVECTOR V, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return V.n128_u32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    return V.m128_u32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
559
560//------------------------------------------------------------------------------
561
// Return the X component in an integer register.
inline uint32_t XMVectorGetIntX(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Bit-cast to integer vector, then move lane 0 to a scalar register.
    return static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_castps_si128(V)));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
574
// Return the Y component in an integer register.
inline uint32_t XMVectorGetIntY(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 to lane 0 in the integer domain, then extract it.
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(1,1,1,1));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
588
// Return the Z component in an integer register.
inline uint32_t XMVectorGetIntZ(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 2 to lane 0 in the integer domain, then extract it.
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(2,2,2,2));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
602
// Return the W component in an integer register.
inline uint32_t XMVectorGetIntW(FXMVECTOR V)
{
#if defined(_XM_NO_INTRINSICS_)
    return V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vgetq_lane_u32(V, 3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 3 to lane 0 in the integer domain, then extract it.
    __m128i vResulti = _mm_shuffle_epi32(_mm_castps_si128(V),_MM_SHUFFLE(3,3,3,3));
    return static_cast<uint32_t>(_mm_cvtsi128_si32(vResulti));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
616
617//------------------------------------------------------------------------------
618
// Store a component indexed by i into a 32 bit integer location in memory.
// x must be non-null and i in 0-3 (both asserted).
_Use_decl_annotations_
inline void XMVectorGetIntByIndexPtr(uint32_t *x, FXMVECTOR V, size_t i)
{
    assert( x != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_u32[i];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    *x = V.n128_u32[i];
#elif defined(_XM_SSE_INTRINSICS_)
    *x = V.m128_u32[i];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
635
636//------------------------------------------------------------------------------
637
// Store the X component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntXPtr(uint32_t *x, FXMVECTOR V)
{
    assert( x != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *x = V.vector4_u32[0];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Store the raw lane-0 bits through a float store.
    // NOTE(review): reinterpret_cast to float* is a type-pun -- tolerated on
    // MSVC, which this SSE path targets.
    _mm_store_ss(reinterpret_cast<float *>(x),V);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
652
// Store the Y component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntYPtr(uint32_t *y, FXMVECTOR V)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *y = V.vector4_u32[1];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 1 to lane 0, then store the raw bits via a float store.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    _mm_store_ss(reinterpret_cast<float *>(y),vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
668
// Store the Z component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntZPtr(uint32_t *z, FXMVECTOR V)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *z = V.vector4_u32[2];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 2 to lane 0, then store the raw bits via a float store.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    _mm_store_ss(reinterpret_cast<float *>(z),vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
684
// Store the W component into a 32 bit integer location in memory.
_Use_decl_annotations_
inline void XMVectorGetIntWPtr(uint32_t *w, FXMVECTOR V)
{
    assert( w != NULL );
#if defined(_XM_NO_INTRINSICS_)
    *w = V.vector4_u32[3];
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    vst1q_lane_u32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Broadcast lane 3 to lane 0, then store the raw bits via a float store.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    _mm_store_ss(reinterpret_cast<float *>(w),vResult);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
700
701//------------------------------------------------------------------------------
702
// Set a single indexed floating point component
// Returns a copy of V with lane i replaced by f; i must be 0-3 (asserted).
inline XMVECTOR XMVectorSetByIndex(FXMVECTOR V, float f, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_f32[i] = f;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Copy-modify through the union member; forces the vector through memory.
    XMVECTOR U = V;
    U.n128_f32[i] = f;
    return U;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR U = V;
    U.m128_f32[i] = f;
    return U;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
724
725//------------------------------------------------------------------------------
726
// Sets the X component of a vector to a passed floating point value
inline XMVECTOR XMVectorSetX(FXMVECTOR V, float x)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = x;
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Build [x 0 0 0] and move its lane 0 into V.
    XMVECTOR vResult = _mm_set_ss(x);
    vResult = _mm_move_ss(V,vResult);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
746
// Sets the Y component of a vector to a passed floating point value
// SSE has no single-lane insert here, so lane 1 is rotated into lane 0,
// replaced with _mm_move_ss, and rotated back.
inline XMVECTOR XMVectorSetY(FXMVECTOR V, float y)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = y;
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap y and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    // Convert input to vector
    XMVECTOR vTemp = _mm_set_ss(y);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap y and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
// Sets the Z component of a vector to a passed floating point value
// SSE path: rotate lane 2 into lane 0, replace it, rotate back (the swap
// shuffle is its own inverse).
inline XMVECTOR XMVectorSetZ(FXMVECTOR V, float z)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = z;
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap z and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
    // Convert input to vector
    XMVECTOR vTemp = _mm_set_ss(z);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap z and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
797
// Sets the W component of a vector to a passed floating point value
// SSE path: rotate lane 3 into lane 0, replace it, rotate back (the swap
// shuffle is its own inverse).
inline XMVECTOR XMVectorSetW(FXMVECTOR V, float w)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = w;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_f32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap w and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
    // Convert input to vector
    XMVECTOR vTemp = _mm_set_ss(w);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap w and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
823
824//------------------------------------------------------------------------------
825
// Sets a component of a vector to a floating point value passed by pointer
// f must be non-null and i in 0-3 (both asserted).
_Use_decl_annotations_
inline XMVECTOR XMVectorSetByIndexPtr(FXMVECTOR V, const float *f, size_t i)
{
    assert( f != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_f32[i] = *f;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Copy-modify through the union member; forces the vector through memory.
    XMVECTOR U = V;
    U.n128_f32[i] = *f;
    return U;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR U = V;
    U.m128_f32[i] = *f;
    return U;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
849
850//------------------------------------------------------------------------------
851
// Sets the X component of a vector to a floating point value passed by pointer
_Use_decl_annotations_
inline XMVECTOR XMVectorSetXPtr(FXMVECTOR V, const float *x)
{
    assert( x != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = *x;
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Load directly from memory into lane 0.
    return vld1q_lane_f32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vResult = _mm_load_ss(x);
    vResult = _mm_move_ss(V,vResult);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
873
// Sets the Y component of a vector to a floating point value passed by pointer
// SSE path mirrors XMVectorSetY: rotate lane 1 into lane 0, replace, rotate back.
_Use_decl_annotations_
inline XMVECTOR XMVectorSetYPtr(FXMVECTOR V, const float *y)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = *y;
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap y and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(y);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap y and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
901
// Sets the Z component of a vector to a floating point value passed by pointer
// SSE path mirrors XMVectorSetZ: rotate lane 2 into lane 0, replace, rotate back.
_Use_decl_annotations_
inline XMVECTOR XMVectorSetZPtr(FXMVECTOR V, const float *z)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = *z;
    U.vector4_f32[3] = V.vector4_f32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap z and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(z);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap z and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
929
// Sets the W component of a vector to a floating point value passed by pointer
// SSE path mirrors XMVectorSetW: rotate lane 3 into lane 0, replace, rotate back.
_Use_decl_annotations_
inline XMVECTOR XMVectorSetWPtr(FXMVECTOR V, const float *w)
{
    assert( w != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_f32[0] = V.vector4_f32[0];
    U.vector4_f32[1] = V.vector4_f32[1];
    U.vector4_f32[2] = V.vector4_f32[2];
    U.vector4_f32[3] = *w;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_f32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap w and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(w);
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap w and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
957
958//------------------------------------------------------------------------------
959
// Sets a component of a vector to an integer passed by value
// Returns a copy of V with the raw 32-bit lane i replaced by x; i must be 0-3.
inline XMVECTOR XMVectorSetIntByIndex(FXMVECTOR V, uint32_t x, size_t i)
{
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_u32[i] = x;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Copy-modify through XMVECTORU32; returning tmp relies on its implicit
    // conversion to XMVECTOR.
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = x;
    return tmp;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = x;
    return tmp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
983
984//------------------------------------------------------------------------------
985
// Sets the X component of a vector to an integer passed by value.
// Returns a copy of V with the bits of lane 0 replaced by x.
inline XMVECTOR XMVectorSetIntX(FXMVECTOR V, uint32_t x)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = x;
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Move x into lane 0 of an integer register, then merge it as the new x lane
    __m128i vTemp = _mm_cvtsi32_si128(x);
    XMVECTOR vResult = _mm_move_ss(V,_mm_castsi128_ps(vTemp));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1005
// Sets the Y component of a vector to an integer passed by value.
// Returns a copy of V with the bits of lane 1 replaced by y.
inline XMVECTOR XMVectorSetIntY(FXMVECTOR V, uint32_t y)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = y;
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // SSE has no direct lane insert, so rotate y into lane 0, replace it, rotate back.
    // Swap y and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    // Convert input to vector
    __m128i vTemp = _mm_cvtsi32_si128(y);
    // Replace the x component
    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
    // Swap y and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1031
// Sets the Z component of a vector to an integer passed by value.
// Returns a copy of V with the bits of lane 2 replaced by z.
inline XMVECTOR XMVectorSetIntZ(FXMVECTOR V, uint32_t z)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = z;
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // SSE has no direct lane insert, so rotate z into lane 0, replace it, rotate back.
    // Swap z and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
    // Convert input to vector
    __m128i vTemp = _mm_cvtsi32_si128(z);
    // Replace the x component
    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
    // Swap z and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1057
// Sets the W component of a vector to an integer passed by value.
// Returns a copy of V with the bits of lane 3 replaced by w.
inline XMVECTOR XMVectorSetIntW(FXMVECTOR V, uint32_t w)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = w;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vsetq_lane_u32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // SSE has no direct lane insert, so rotate w into lane 0, replace it, rotate back.
    // Swap w and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
    // Convert input to vector
    __m128i vTemp = _mm_cvtsi32_si128(w);
    // Replace the x component
    vResult = _mm_move_ss(vResult,_mm_castsi128_ps(vTemp));
    // Swap w and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1083
1084//------------------------------------------------------------------------------
1085
// Sets a component of a vector to an integer value passed by pointer.
// i selects the lane (0=x .. 3=w); all other lanes are copied from V unchanged.
_Use_decl_annotations_
inline XMVECTOR XMVectorSetIntByIndexPtr(FXMVECTOR V, const uint32_t *x, size_t i)
{
    assert( x != NULL );
    assert( i < 4 );
    _Analysis_assume_( i < 4 );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U = V;
    U.vector4_u32[i] = *x;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // The lane index is not a compile-time constant, so go through a union in memory
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = *x;
    return tmp;
#elif defined(_XM_SSE_INTRINSICS_)
    // The lane index is not a compile-time constant, so go through a union in memory
    XMVECTORU32 tmp;
    tmp.v = V;
    tmp.u[i] = *x;
    return tmp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1111
1112//------------------------------------------------------------------------------
1113
// Sets the X component of a vector to an integer value passed by pointer.
// Returns a copy of V with the bits of lane 0 replaced by *x.
_Use_decl_annotations_
inline XMVECTOR XMVectorSetIntXPtr(FXMVECTOR V, const uint32_t *x)
{
    assert( x != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = *x;
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_u32(x,V,0);
#elif defined(_XM_SSE_INTRINSICS_)
    // Load the 32 bits as a float lane (pure bit copy), then merge as the new x lane
    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(x));
    XMVECTOR vResult = _mm_move_ss(V,vTemp);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1135
// Sets the Y component of a vector to an integer value passed by pointer.
// Returns a copy of V with the bits of lane 1 replaced by *y.
_Use_decl_annotations_
inline XMVECTOR XMVectorSetIntYPtr(FXMVECTOR V, const uint32_t *y)
{
    assert( y != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = *y;
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_u32(y,V,1);
#elif defined(_XM_SSE_INTRINSICS_)
    // SSE has no direct lane insert, so rotate y into lane 0, replace it, rotate back.
    // Swap y and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(y));
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap y and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,2,0,1));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1163
// Sets the Z component of a vector to an integer value passed by pointer.
// Returns a copy of V with the bits of lane 2 replaced by *z.
_Use_decl_annotations_
inline XMVECTOR XMVectorSetIntZPtr(FXMVECTOR V, const uint32_t *z)
{
    assert( z != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = *z;
    U.vector4_u32[3] = V.vector4_u32[3];
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_u32(z,V,2);
#elif defined(_XM_SSE_INTRINSICS_)
    // SSE has no direct lane insert, so rotate z into lane 0, replace it, rotate back.
    // Swap z and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,0,1,2));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(z));
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap z and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(3,0,1,2));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1191
// Sets the W component of a vector to an integer value passed by pointer.
// Returns a copy of V with the bits of lane 3 replaced by *w.
_Use_decl_annotations_
inline XMVECTOR XMVectorSetIntWPtr(FXMVECTOR V, const uint32_t *w)
{
    assert( w != NULL );
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR U;
    U.vector4_u32[0] = V.vector4_u32[0];
    U.vector4_u32[1] = V.vector4_u32[1];
    U.vector4_u32[2] = V.vector4_u32[2];
    U.vector4_u32[3] = *w;
    return U;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    return vld1q_lane_u32(w,V,3);
#elif defined(_XM_SSE_INTRINSICS_)
    // SSE has no direct lane insert, so rotate w into lane 0, replace it, rotate back.
    // Swap w and x
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,2,1,3));
    // Convert input to vector
    XMVECTOR vTemp = _mm_load_ss(reinterpret_cast<const float *>(w));
    // Replace the x component
    vResult = _mm_move_ss(vResult,vTemp);
    // Swap w and x again
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,2,1,3));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1219
1220//------------------------------------------------------------------------------
1221
// Returns a vector whose components are selected from V by index:
// result = { V[E0], V[E1], V[E2], V[E3] }. Indices must be 0..3 and may repeat.
inline XMVECTOR XMVectorSwizzle
(
    FXMVECTOR V,
    uint32_t E0,
    uint32_t E1,
    uint32_t E2,
    uint32_t E3
)
{
    assert( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
    _Analysis_assume_( (E0 < 4) && (E1 < 4) && (E2 < 4) && (E3 < 4) );
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result = { V.vector4_f32[E0],
                        V.vector4_f32[E1],
                        V.vector4_f32[E2],
                        V.vector4_f32[E3] };
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Implemented as a byte-wise table lookup (vtbl): each entry below is the
    // four byte offsets of one 32-bit lane, pre-arranged for the target endianness.
    static const uint32_t ControlElement[ 4 ] =
    {
#ifdef _XM_LITTLEENDIAN_
        0x03020100, // XM_SWIZZLE_X
        0x07060504, // XM_SWIZZLE_Y
        0x0B0A0908, // XM_SWIZZLE_Z
        0x0F0E0D0C, // XM_SWIZZLE_W
#else
        0x00010203, // XM_SWIZZLE_X
        0x04050607, // XM_SWIZZLE_Y
        0x08090A0B, // XM_SWIZZLE_Z
        0x0C0D0E0F, // XM_SWIZZLE_W
#endif
    };

    // The 16-byte source is split into two 8-byte table halves for vtbl2
    int8x8x2_t tbl;
    tbl.val[0] = vget_low_f32(V);
    tbl.val[1] = vget_high_f32(V);

    // Look up the low two output lanes, then the high two
    __n64 idx = vcreate_u32( ((uint64_t)ControlElement[E0]) | (((uint64_t)ControlElement[E1]) << 32) );
    const __n64 rL = vtbl2_u8( tbl, idx );

    idx = vcreate_u32( ((uint64_t)ControlElement[E2]) | (((uint64_t)ControlElement[E3]) << 32) );
    const __n64 rH = vtbl2_u8( tbl, idx );

    return vcombine_f32( rL, rH );
#elif defined(_XM_VMX128_INTRINSICS_)
#else
    // Generic path (includes SSE with runtime indices): copy lanes as raw
    // 32-bit words. NOTE(review): relies on pointer-punning XMVECTOR as
    // uint32_t[4] — presumably safe for the compilers this targets.
    const uint32_t *aPtr = (const uint32_t* )(&V);

    XMVECTOR Result;
    uint32_t *pWork = (uint32_t*)(&Result);

    pWork[0] = aPtr[E0];
    pWork[1] = aPtr[E1];
    pWork[2] = aPtr[E2];
    pWork[3] = aPtr[E3];

    return Result;
#endif
}
1283
1284//------------------------------------------------------------------------------
// Returns a vector whose components are selected from the pair (V1,V2) by index:
// indices 0..3 pick V1.xyzw, indices 4..7 pick V2.xyzw. Indices may repeat.
inline XMVECTOR XMVectorPermute
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    uint32_t PermuteX,
    uint32_t PermuteY,
    uint32_t PermuteZ,
    uint32_t PermuteW
)
{
    assert( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );
    _Analysis_assume_( PermuteX <= 7 && PermuteY <= 7 && PermuteZ <= 7 && PermuteW <= 7 );

#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    // Implemented as a byte-wise table lookup (vtbl4) across both vectors:
    // each entry below is the four byte offsets of one 32-bit lane within the
    // 32-byte (V1,V2) table, pre-arranged for the target endianness.
    static const uint32_t ControlElement[ 8 ] =
    {
#ifdef _XM_LITTLEENDIAN_
        0x03020100, // XM_PERMUTE_0X
        0x07060504, // XM_PERMUTE_0Y
        0x0B0A0908, // XM_PERMUTE_0Z
        0x0F0E0D0C, // XM_PERMUTE_0W
        0x13121110, // XM_PERMUTE_1X
        0x17161514, // XM_PERMUTE_1Y
        0x1B1A1918, // XM_PERMUTE_1Z
        0x1F1E1D1C, // XM_PERMUTE_1W
#else
        0x00010203, // XM_PERMUTE_0X
        0x04050607, // XM_PERMUTE_0Y
        0x08090A0B, // XM_PERMUTE_0Z
        0x0C0D0E0F, // XM_PERMUTE_0W
        0x10111213, // XM_PERMUTE_1X
        0x14151617, // XM_PERMUTE_1Y
        0x18191A1B, // XM_PERMUTE_1Z
        0x1C1D1E1F, // XM_PERMUTE_1W
#endif
    };

    // The two 16-byte sources form a four-half table for vtbl4
    int8x8x4_t tbl;
    tbl.val[0] = vget_low_f32(V1);
    tbl.val[1] = vget_high_f32(V1);
    tbl.val[2] = vget_low_f32(V2);
    tbl.val[3] = vget_high_f32(V2);

    // Look up the low two output lanes, then the high two
    __n64 idx = vcreate_u32( ((uint64_t)ControlElement[PermuteX]) | (((uint64_t)ControlElement[PermuteY]) << 32) );
    const __n64 rL = vtbl4_u8( tbl, idx );

    idx = vcreate_u32( ((uint64_t)ControlElement[PermuteZ]) | (((uint64_t)ControlElement[PermuteW]) << 32) );
    const __n64 rH = vtbl4_u8( tbl, idx );

    return vcombine_f32( rL, rH );
#elif defined(_XM_VMX128_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
#else
    // Generic path (includes SSE with runtime indices): split each index into
    // vector select (bit 2) and lane (low 2 bits), then copy raw 32-bit words.
    // NOTE(review): relies on pointer-punning XMVECTOR as uint32_t[4].

    const uint32_t *aPtr[2];
    aPtr[0] = (const uint32_t* )(&V1);
    aPtr[1] = (const uint32_t* )(&V2);

    XMVECTOR Result;
    uint32_t *pWork = (uint32_t*)(&Result);

    const uint32_t i0 = PermuteX & 3;
    const uint32_t vi0 = PermuteX >> 2;
    pWork[0] = aPtr[vi0][i0];

    const uint32_t i1 = PermuteY & 3;
    const uint32_t vi1 = PermuteY >> 2;
    pWork[1] = aPtr[vi1][i1];

    const uint32_t i2 = PermuteZ & 3;
    const uint32_t vi2 = PermuteZ >> 2;
    pWork[2] = aPtr[vi2][i2];

    const uint32_t i3 = PermuteW & 3;
    const uint32_t vi3 = PermuteW >> 2;
    pWork[3] = aPtr[vi3][i3];

    return Result;
#endif
}
1364
1365//------------------------------------------------------------------------------
1366// Define a control vector to be used in XMVectorSelect
1367// operations. The four integers specified in XMVectorSelectControl
1368// serve as indices to select between components in two vectors.
1369// The first index controls selection for the first component of
1370// the vectors involved in a select operation, the second index
1371// controls selection for the second component etc. A value of
1372// zero for an index causes the corresponding component from the first
1373// vector to be selected whereas a one causes the component from the
1374// second vector to be selected instead.
1375
1376inline XMVECTOR XMVectorSelectControl
1377(
1378 uint32_t VectorIndex0,
1379 uint32_t VectorIndex1,
1380 uint32_t VectorIndex2,
1381 uint32_t VectorIndex3
1382)
1383{
1384#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
1385 // x=Index0,y=Index1,z=Index2,w=Index3
1386 __m128i vTemp = _mm_set_epi32(VectorIndex3,VectorIndex2,VectorIndex1,VectorIndex0);
1387 // Any non-zero entries become 0xFFFFFFFF else 0
1388 vTemp = _mm_cmpgt_epi32(vTemp,g_XMZero);
1389 return reinterpret_cast<__m128 *>(&vTemp)[0];
1390#elif defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
1391 __n64 V0 = vcreate_s32(((uint64_t)VectorIndex0) | ((uint64_t)VectorIndex1 << 32));
1392 __n64 V1 = vcreate_s32(((uint64_t)VectorIndex2) | ((uint64_t)VectorIndex3 << 32));
1393 __n128 vTemp = vcombine_s32(V0, V1);
1394 // Any non-zero entries become 0xFFFFFFFF else 0
1395 return vcgtq_s32(vTemp,g_XMZero);
1396#else
1397 XMVECTOR ControlVector;
1398 const uint32_t ControlElement[] =
1399 {
1400 XM_SELECT_0,
1401 XM_SELECT_1
1402 };
1403
1404 assert(VectorIndex0 < 2);
1405 assert(VectorIndex1 < 2);
1406 assert(VectorIndex2 < 2);
1407 assert(VectorIndex3 < 2);
1408 _Analysis_assume_(VectorIndex0 < 2);
1409 _Analysis_assume_(VectorIndex1 < 2);
1410 _Analysis_assume_(VectorIndex2 < 2);
1411 _Analysis_assume_(VectorIndex3 < 2);
1412
1413 ControlVector.vector4_u32[0] = ControlElement[VectorIndex0];
1414 ControlVector.vector4_u32[1] = ControlElement[VectorIndex1];
1415 ControlVector.vector4_u32[2] = ControlElement[VectorIndex2];
1416 ControlVector.vector4_u32[3] = ControlElement[VectorIndex3];
1417
1418 return ControlVector;
1419
1420#endif
1421}
1422
1423//------------------------------------------------------------------------------
1424
1425inline XMVECTOR XMVectorSelect
1426(
1427 FXMVECTOR V1,
1428 FXMVECTOR V2,
1429 FXMVECTOR Control
1430)
1431{
1432#if defined(_XM_NO_INTRINSICS_)
1433
1434 XMVECTOR Result;
1435 Result.vector4_u32[0] = (V1.vector4_u32[0] & ~Control.vector4_u32[0]) | (V2.vector4_u32[0] & Control.vector4_u32[0]);
1436 Result.vector4_u32[1] = (V1.vector4_u32[1] & ~Control.vector4_u32[1]) | (V2.vector4_u32[1] & Control.vector4_u32[1]);
1437 Result.vector4_u32[2] = (V1.vector4_u32[2] & ~Control.vector4_u32[2]) | (V2.vector4_u32[2] & Control.vector4_u32[2]);
1438 Result.vector4_u32[3] = (V1.vector4_u32[3] & ~Control.vector4_u32[3]) | (V2.vector4_u32[3] & Control.vector4_u32[3]);
1439 return Result;
1440
1441#elif defined(_XM_ARM_NEON_INTRINSICS_)
1442 return vbslq_f32( Control, V2, V1 );
1443#elif defined(_XM_SSE_INTRINSICS_)
1444 XMVECTOR vTemp1 = _mm_andnot_ps(Control,V1);
1445 XMVECTOR vTemp2 = _mm_and_ps(V2,Control);
1446 return _mm_or_ps(vTemp1,vTemp2);
1447#else // _XM_VMX128_INTRINSICS_
1448#endif // _XM_VMX128_INTRINSICS_
1449}
1450
1451//------------------------------------------------------------------------------
1452
1453inline XMVECTOR XMVectorMergeXY
1454(
1455 FXMVECTOR V1,
1456 FXMVECTOR V2
1457)
1458{
1459#if defined(_XM_NO_INTRINSICS_)
1460
1461 XMVECTOR Result;
1462 Result.vector4_u32[0] = V1.vector4_u32[0];
1463 Result.vector4_u32[1] = V2.vector4_u32[0];
1464 Result.vector4_u32[2] = V1.vector4_u32[1];
1465 Result.vector4_u32[3] = V2.vector4_u32[1];
1466 return Result;
1467
1468#elif defined(_XM_ARM_NEON_INTRINSICS_)
1469 return vzipq_f32( V1, V2 ).val[0];
1470#elif defined(_XM_SSE_INTRINSICS_)
1471 return _mm_unpacklo_ps( V1, V2 );
1472#else // _XM_VMX128_INTRINSICS_
1473#endif // _XM_VMX128_INTRINSICS_
1474}
1475
1476//------------------------------------------------------------------------------
1477
1478inline XMVECTOR XMVectorMergeZW
1479(
1480 FXMVECTOR V1,
1481 FXMVECTOR V2
1482)
1483{
1484#if defined(_XM_NO_INTRINSICS_)
1485
1486 XMVECTOR Result;
1487 Result.vector4_u32[0] = V1.vector4_u32[2];
1488 Result.vector4_u32[1] = V2.vector4_u32[2];
1489 Result.vector4_u32[2] = V1.vector4_u32[3];
1490 Result.vector4_u32[3] = V2.vector4_u32[3];
1491 return Result;
1492
1493#elif defined(_XM_ARM_NEON_INTRINSICS_)
1494 return vzipq_f32( V1, V2 ).val[1];
1495#elif defined(_XM_SSE_INTRINSICS_)
1496 return _mm_unpackhi_ps( V1, V2 );
1497#else // _XM_VMX128_INTRINSICS_
1498#endif // _XM_VMX128_INTRINSICS_
1499}
1500
1501//------------------------------------------------------------------------------
1502
1503inline XMVECTOR XMVectorShiftLeft(FXMVECTOR V1, FXMVECTOR V2, uint32_t Elements)
1504{
1505 assert( Elements < 4 );
1506 _Analysis_assume_( Elements < 4 );
1507 return XMVectorPermute(V1, V2, Elements, ((Elements) + 1), ((Elements) + 2), ((Elements) + 3));
1508}
1509
1510//------------------------------------------------------------------------------
1511
1512inline XMVECTOR XMVectorRotateLeft(FXMVECTOR V, uint32_t Elements)
1513{
1514 assert( Elements < 4 );
1515 _Analysis_assume_( Elements < 4 );
1516 return XMVectorSwizzle( V, Elements & 3, (Elements + 1) & 3, (Elements + 2) & 3, (Elements + 3) & 3 );
1517}
1518
1519//------------------------------------------------------------------------------
1520
1521inline XMVECTOR XMVectorRotateRight(FXMVECTOR V, uint32_t Elements)
1522{
1523 assert( Elements < 4 );
1524 _Analysis_assume_( Elements < 4 );
1525 return XMVectorSwizzle( V, (4 - (Elements)) & 3, (5 - (Elements)) & 3, (6 - (Elements)) & 3, (7 - (Elements)) & 3 );
1526}
1527
1528//------------------------------------------------------------------------------
1529
1530inline XMVECTOR XMVectorInsert(FXMVECTOR VD, FXMVECTOR VS, uint32_t VSLeftRotateElements,
1531 uint32_t Select0, uint32_t Select1, uint32_t Select2, uint32_t Select3)
1532{
1533 XMVECTOR Control = XMVectorSelectControl(Select0&1, Select1&1, Select2&1, Select3&1);
1534 return XMVectorSelect( VD, XMVectorRotateLeft(VS, VSLeftRotateElements), Control );
1535}
1536
1537//------------------------------------------------------------------------------
1538// Comparison operations
1539//------------------------------------------------------------------------------
1540
1541//------------------------------------------------------------------------------
1542
1543inline XMVECTOR XMVectorEqual
1544(
1545 FXMVECTOR V1,
1546 FXMVECTOR V2
1547)
1548{
1549#if defined(_XM_NO_INTRINSICS_)
1550
1551 XMVECTOR Control;
1552 Control.vector4_u32[0] = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
1553 Control.vector4_u32[1] = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
1554 Control.vector4_u32[2] = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
1555 Control.vector4_u32[3] = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
1556 return Control;
1557
1558#elif defined(_XM_ARM_NEON_INTRINSICS_)
1559 return vceqq_f32( V1, V2 );
1560#elif defined(_XM_SSE_INTRINSICS_)
1561 return _mm_cmpeq_ps( V1, V2 );
1562#else // _XM_VMX128_INTRINSICS_
1563#endif // _XM_VMX128_INTRINSICS_
1564}
1565
1566//------------------------------------------------------------------------------
1567
// Per-component floating point equality test that also reports a summary
// record via *pCR: XM_CRMASK_CR6TRUE if all four lanes are equal,
// XM_CRMASK_CR6FALSE if none are, 0 otherwise.
// Returns the same per-lane mask as XMVectorEqual.
_Use_decl_annotations_
inline XMVECTOR XMVectorEqualR
(
    uint32_t* pCR,
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    assert( pCR != NULL );
#if defined(_XM_NO_INTRINSICS_)
    uint32_t ux = (V1.vector4_f32[0] == V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V1.vector4_f32[1] == V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V1.vector4_f32[2] == V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V1.vector4_f32[3] == V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!(ux|uy|uz|uw))
    {
        // All elements are not equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_f32( V1, V2 );
    // Condense the four 32-bit lane masks into one 32-bit value (one byte per lane)
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // All elements are not equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    uint32_t CR = 0;
    // movemask gathers the sign bit of each lane; 0xf means every lane matched
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf)
    {
        // All elements are equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // All elements are not equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1638
1639//------------------------------------------------------------------------------
1640// Treat the components of the vectors as unsigned integers and
1641// compare individual bits between the two. This is useful for
1642// comparing control vectors and result vectors returned from
1643// other comparison operations.
1644
1645inline XMVECTOR XMVectorEqualInt
1646(
1647 FXMVECTOR V1,
1648 FXMVECTOR V2
1649)
1650{
1651#if defined(_XM_NO_INTRINSICS_)
1652
1653 XMVECTOR Control;
1654 Control.vector4_u32[0] = (V1.vector4_u32[0] == V2.vector4_u32[0]) ? 0xFFFFFFFF : 0;
1655 Control.vector4_u32[1] = (V1.vector4_u32[1] == V2.vector4_u32[1]) ? 0xFFFFFFFF : 0;
1656 Control.vector4_u32[2] = (V1.vector4_u32[2] == V2.vector4_u32[2]) ? 0xFFFFFFFF : 0;
1657 Control.vector4_u32[3] = (V1.vector4_u32[3] == V2.vector4_u32[3]) ? 0xFFFFFFFF : 0;
1658 return Control;
1659
1660#elif defined(_XM_ARM_NEON_INTRINSICS_)
1661 return vceqq_u32( V1, V2 );
1662#elif defined(_XM_SSE_INTRINSICS_)
1663 __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) );
1664 return reinterpret_cast<__m128 *>(&V)[0];
1665#else // _XM_VMX128_INTRINSICS_
1666#endif // _XM_VMX128_INTRINSICS_
1667}
1668
1669//------------------------------------------------------------------------------
1670
1671_Use_decl_annotations_
1672inline XMVECTOR XMVectorEqualIntR
1673(
1674 uint32_t* pCR,
1675 FXMVECTOR V1,
1676 FXMVECTOR V2
1677)
1678{
1679 assert( pCR != NULL );
1680#if defined(_XM_NO_INTRINSICS_)
1681
1682 XMVECTOR Control = XMVectorEqualInt(V1, V2);
1683
1684 *pCR = 0;
1685 if (XMVector4EqualInt(Control, XMVectorTrueInt()))
1686 {
1687 // All elements are equal
1688 *pCR |= XM_CRMASK_CR6TRUE;
1689 }
1690 else if (XMVector4EqualInt(Control, XMVectorFalseInt()))
1691 {
1692 // All elements are not equal
1693 *pCR |= XM_CRMASK_CR6FALSE;
1694 }
1695 return Control;
1696
1697#elif defined(_XM_ARM_NEON_INTRINSICS_)
1698 __n128 vResult = vceqq_u32( V1, V2 );
1699 int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
1700 vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
1701 uint32_t r = vget_lane_u32(vTemp.val[1], 1);
1702 uint32_t CR = 0;
1703 if ( r == 0xFFFFFFFFU )
1704 {
1705 // All elements are equal
1706 CR = XM_CRMASK_CR6TRUE;
1707 }
1708 else if ( !r )
1709 {
1710 // All elements are not equal
1711 CR = XM_CRMASK_CR6FALSE;
1712 }
1713 *pCR = CR;
1714 return vResult;
1715#elif defined(_XM_SSE_INTRINSICS_)
1716 __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) );
1717 int iTemp = _mm_movemask_ps(reinterpret_cast<const __m128*>(&V)[0]);
1718 uint32_t CR = 0;
1719 if (iTemp==0x0F)
1720 {
1721 CR = XM_CRMASK_CR6TRUE;
1722 }
1723 else if (!iTemp)
1724 {
1725 CR = XM_CRMASK_CR6FALSE;
1726 }
1727 *pCR = CR;
1728 return reinterpret_cast<__m128 *>(&V)[0];
1729#else // _XM_VMX128_INTRINSICS_
1730#endif // _XM_VMX128_INTRINSICS_
1731}
1732
1733//------------------------------------------------------------------------------
1734
// Per-component approximate equality test: lane i passes when
// |V1[i] - V2[i]| <= Epsilon[i]. Returns 0xFFFFFFFF per passing lane, else 0.
// NOTE(review): Epsilon components are presumably non-negative — the NEON
// path (vacleq) compares absolute values of both operands; confirm callers.
inline XMVECTOR XMVectorNearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
)
{
#if defined(_XM_NO_INTRINSICS_)

    float fDeltax = V1.vector4_f32[0]-V2.vector4_f32[0];
    float fDeltay = V1.vector4_f32[1]-V2.vector4_f32[1];
    float fDeltaz = V1.vector4_f32[2]-V2.vector4_f32[2];
    float fDeltaw = V1.vector4_f32[3]-V2.vector4_f32[3];

    fDeltax = fabsf(fDeltax);
    fDeltay = fabsf(fDeltay);
    fDeltaz = fabsf(fDeltaz);
    fDeltaw = fabsf(fDeltaw);

    XMVECTOR Control;
    Control.vector4_u32[0] = (fDeltax <= Epsilon.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[1] = (fDeltay <= Epsilon.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[2] = (fDeltaz <= Epsilon.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    Control.vector4_u32[3] = (fDeltaw <= Epsilon.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // vacleq_f32 performs an absolute compare: |delta| <= |Epsilon|
    XMVECTOR vDelta = vsubq_f32(V1,V2);
    return vacleq_f32( vDelta, Epsilon );
#elif defined(_XM_SSE_INTRINSICS_)
    // Get the difference
    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
    // Get the absolute value of the difference via max(-delta, delta)
    XMVECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_sub_ps(vTemp,vDelta);
    vTemp = _mm_max_ps(vTemp,vDelta);
    vTemp = _mm_cmple_ps(vTemp,Epsilon);
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1776
1777//------------------------------------------------------------------------------
1778
1779inline XMVECTOR XMVectorNotEqual
1780(
1781 FXMVECTOR V1,
1782 FXMVECTOR V2
1783)
1784{
1785#if defined(_XM_NO_INTRINSICS_)
1786
1787 XMVECTOR Control;
1788 Control.vector4_u32[0] = (V1.vector4_f32[0] != V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
1789 Control.vector4_u32[1] = (V1.vector4_f32[1] != V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
1790 Control.vector4_u32[2] = (V1.vector4_f32[2] != V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
1791 Control.vector4_u32[3] = (V1.vector4_f32[3] != V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
1792 return Control;
1793
1794#elif defined(_XM_ARM_NEON_INTRINSICS_)
1795 return vmvnq_u32(vceqq_f32(V1, V2));
1796#elif defined(_XM_SSE_INTRINSICS_)
1797 return _mm_cmpneq_ps( V1, V2 );
1798#else // _XM_VMX128_INTRINSICS_
1799#endif // _XM_VMX128_INTRINSICS_
1800}
1801
1802//------------------------------------------------------------------------------
1803
1804inline XMVECTOR XMVectorNotEqualInt
1805(
1806 FXMVECTOR V1,
1807 FXMVECTOR V2
1808)
1809{
1810#if defined(_XM_NO_INTRINSICS_)
1811
1812 XMVECTOR Control;
1813 Control.vector4_u32[0] = (V1.vector4_u32[0] != V2.vector4_u32[0]) ? 0xFFFFFFFFU : 0;
1814 Control.vector4_u32[1] = (V1.vector4_u32[1] != V2.vector4_u32[1]) ? 0xFFFFFFFFU : 0;
1815 Control.vector4_u32[2] = (V1.vector4_u32[2] != V2.vector4_u32[2]) ? 0xFFFFFFFFU : 0;
1816 Control.vector4_u32[3] = (V1.vector4_u32[3] != V2.vector4_u32[3]) ? 0xFFFFFFFFU : 0;
1817 return Control;
1818
1819#elif defined(_XM_ARM_NEON_INTRINSICS_)
1820 return vmvnq_u32(vceqq_u32(V1, V2));
1821#elif defined(_XM_SSE_INTRINSICS_)
1822 __m128i V = _mm_cmpeq_epi32( _mm_castps_si128(V1),_mm_castps_si128(V2) );
1823 return _mm_xor_ps(reinterpret_cast<__m128 *>(&V)[0],g_XMNegOneMask);
1824#else // _XM_VMX128_INTRINSICS_
1825#endif // _XM_VMX128_INTRINSICS_
1826}
1827
1828//------------------------------------------------------------------------------
1829
1830inline XMVECTOR XMVectorGreater
1831(
1832 FXMVECTOR V1,
1833 FXMVECTOR V2
1834)
1835{
1836#if defined(_XM_NO_INTRINSICS_)
1837
1838 XMVECTOR Control;
1839 Control.vector4_u32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
1840 Control.vector4_u32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
1841 Control.vector4_u32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
1842 Control.vector4_u32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
1843 return Control;
1844
1845#elif defined(_XM_ARM_NEON_INTRINSICS_)
1846 return vcgtq_f32( V1, V2 );
1847#elif defined(_XM_SSE_INTRINSICS_)
1848 return _mm_cmpgt_ps( V1, V2 );
1849#else // _XM_VMX128_INTRINSICS_
1850#endif // _XM_VMX128_INTRINSICS_
1851}
1852
1853//------------------------------------------------------------------------------
1854
// Per-component greater-than test that also reports a summary record via
// *pCR: XM_CRMASK_CR6TRUE if all four lanes satisfy V1 > V2,
// XM_CRMASK_CR6FALSE if none do, 0 otherwise.
// Returns the same per-lane mask as XMVectorGreater.
_Use_decl_annotations_
inline XMVECTOR XMVectorGreaterR
(
    uint32_t* pCR,
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    assert( pCR != NULL );
#if defined(_XM_NO_INTRINSICS_)

    uint32_t ux = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!(ux|uy|uz|uw))
    {
        // All elements are not greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgtq_f32( V1, V2 );
    // Condense the four 32-bit lane masks into one 32-bit value (one byte per lane)
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // All elements are not greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    uint32_t CR = 0;
    // movemask gathers the sign bit of each lane; 0xf means every lane passed
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf)
    {
        // All elements are greater
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // All elements are not greater
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1926
1927//------------------------------------------------------------------------------
1928
1929inline XMVECTOR XMVectorGreaterOrEqual
1930(
1931 FXMVECTOR V1,
1932 FXMVECTOR V2
1933)
1934{
1935#if defined(_XM_NO_INTRINSICS_)
1936
1937 XMVECTOR Control;
1938 Control.vector4_u32[0] = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
1939 Control.vector4_u32[1] = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
1940 Control.vector4_u32[2] = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
1941 Control.vector4_u32[3] = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
1942 return Control;
1943
1944#elif defined(_XM_ARM_NEON_INTRINSICS_)
1945 return vcgeq_f32( V1, V2 );
1946#elif defined(_XM_SSE_INTRINSICS_)
1947 return _mm_cmpge_ps( V1, V2 );
1948#else // _XM_VMX128_INTRINSICS_
1949#endif // _XM_VMX128_INTRINSICS_
1950}
1951
1952//------------------------------------------------------------------------------
1953
// Per-component test V1 >= V2 that also stores a CR6-style summary in *pCR:
// XM_CRMASK_CR6TRUE when all four lanes pass, XM_CRMASK_CR6FALSE when none do,
// 0 otherwise.  The returned vector is the per-lane mask (0xFFFFFFFF per
// passing lane).
_Use_decl_annotations_
inline XMVECTOR XMVectorGreaterOrEqualR
(
    uint32_t* pCR,
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    assert( pCR != NULL );
#if defined(_XM_NO_INTRINSICS_)

    uint32_t ux = (V1.vector4_f32[0] >= V2.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V1.vector4_f32[1] >= V2.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V1.vector4_f32[2] >= V2.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V1.vector4_f32[3] >= V2.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!(ux|uy|uz|uw))
    {
        // All elements are not greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgeq_f32( V1, V2 );
    // Fold the four 32-bit lane masks down to one 32-bit word (one byte per
    // lane) so a single scalar compare can classify all/none/some.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // All elements are not greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    uint32_t CR = 0;
    // One sign bit per lane: 0xf means every lane compared true
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf)
    {
        // All elements are greater or equal
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // All elements are not greater or equal
        CR = XM_CRMASK_CR6FALSE;
    }
    *pCR = CR;
    return vTemp;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
2025
2026//------------------------------------------------------------------------------
2027
2028inline XMVECTOR XMVectorLess
2029(
2030 FXMVECTOR V1,
2031 FXMVECTOR V2
2032)
2033{
2034#if defined(_XM_NO_INTRINSICS_)
2035
2036 XMVECTOR Control;
2037 Control.vector4_u32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
2038 Control.vector4_u32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
2039 Control.vector4_u32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
2040 Control.vector4_u32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
2041 return Control;
2042
2043#elif defined(_XM_ARM_NEON_INTRINSICS_)
2044 return vcltq_f32( V1, V2 );
2045#elif defined(_XM_SSE_INTRINSICS_)
2046 return _mm_cmplt_ps( V1, V2 );
2047#else // _XM_VMX128_INTRINSICS_
2048#endif // _XM_VMX128_INTRINSICS_
2049}
2050
2051//------------------------------------------------------------------------------
2052
2053inline XMVECTOR XMVectorLessOrEqual
2054(
2055 FXMVECTOR V1,
2056 FXMVECTOR V2
2057)
2058{
2059#if defined(_XM_NO_INTRINSICS_)
2060
2061 XMVECTOR Control;
2062 Control.vector4_u32[0] = (V1.vector4_f32[0] <= V2.vector4_f32[0]) ? 0xFFFFFFFF : 0;
2063 Control.vector4_u32[1] = (V1.vector4_f32[1] <= V2.vector4_f32[1]) ? 0xFFFFFFFF : 0;
2064 Control.vector4_u32[2] = (V1.vector4_f32[2] <= V2.vector4_f32[2]) ? 0xFFFFFFFF : 0;
2065 Control.vector4_u32[3] = (V1.vector4_f32[3] <= V2.vector4_f32[3]) ? 0xFFFFFFFF : 0;
2066 return Control;
2067
2068#elif defined(_XM_ARM_NEON_INTRINSICS_)
2069 return vcleq_f32( V1, V2 );
2070#elif defined(_XM_SSE_INTRINSICS_)
2071 return _mm_cmple_ps( V1, V2 );
2072#else // _XM_VMX128_INTRINSICS_
2073#endif // _XM_VMX128_INTRINSICS_
2074}
2075
2076//------------------------------------------------------------------------------
2077
2078inline XMVECTOR XMVectorInBounds
2079(
2080 FXMVECTOR V,
2081 FXMVECTOR Bounds
2082)
2083{
2084#if defined(_XM_NO_INTRINSICS_)
2085
2086 XMVECTOR Control;
2087 Control.vector4_u32[0] = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFF : 0;
2088 Control.vector4_u32[1] = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFF : 0;
2089 Control.vector4_u32[2] = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFF : 0;
2090 Control.vector4_u32[3] = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFF : 0;
2091 return Control;
2092
2093#elif defined(_XM_ARM_NEON_INTRINSICS_)
2094 // Test if less than or equal
2095 XMVECTOR vTemp1 = vcleq_f32(V,Bounds);
2096 // Negate the bounds
2097 XMVECTOR vTemp2 = vnegq_f32(Bounds);
2098 // Test if greater or equal (Reversed)
2099 vTemp2 = vcleq_f32(vTemp2,V);
2100 // Blend answers
2101 vTemp1 = vandq_u32(vTemp1,vTemp2);
2102 return vTemp1;
2103#elif defined(_XM_SSE_INTRINSICS_)
2104 // Test if less than or equal
2105 XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
2106 // Negate the bounds
2107 XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
2108 // Test if greater or equal (Reversed)
2109 vTemp2 = _mm_cmple_ps(vTemp2,V);
2110 // Blend answers
2111 vTemp1 = _mm_and_ps(vTemp1,vTemp2);
2112 return vTemp1;
2113#else // _XM_VMX128_INTRINSICS_
2114#endif // _XM_VMX128_INTRINSICS_
2115}
2116
2117//------------------------------------------------------------------------------
2118
// Per-component test -Bounds <= V <= Bounds that also stores a summary in
// *pCR: XM_CRMASK_CR6BOUNDS when all four components are in bounds, else 0.
// The returned vector is the per-lane mask (0xFFFFFFFF per in-bounds lane).
_Use_decl_annotations_
inline XMVECTOR XMVectorInBoundsR
(
    uint32_t* pCR,
    FXMVECTOR V,
    FXMVECTOR Bounds
)
{
    assert( pCR != NULL );
#if defined(_XM_NO_INTRINSICS_)

    uint32_t ux = (V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
    uint32_t uy = (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
    uint32_t uz = (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
    uint32_t uw = (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3]) ? 0xFFFFFFFFU : 0;

    uint32_t CR = 0;
    if (ux&uy&uz&uw)
    {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;

    XMVECTOR Control;
    Control.vector4_u32[0] = ux;
    Control.vector4_u32[1] = uy;
    Control.vector4_u32[2] = uz;
    Control.vector4_u32[3] = uw;
    return Control;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = vcleq_f32(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = vnegq_f32(Bounds);
    // Test if greater or equal (Reversed)
    vTemp2 = vcleq_f32(vTemp2,V);
    // Blend answers
    vTemp1 = vandq_u32(vTemp1,vTemp2);
    // Fold the four 32-bit lane masks down to one 32-bit word (one byte per
    // lane) so a single scalar compare can detect the all-in-bounds case.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;
    return vTemp1;
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);

    uint32_t CR = 0;
    // One sign bit per lane: 0xf means every lane is in bounds
    if (_mm_movemask_ps(vTemp1)==0xf) {
        // All elements are in bounds
        CR = XM_CRMASK_CR6BOUNDS;
    }
    *pCR = CR;
    return vTemp1;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
2190
2191//------------------------------------------------------------------------------
2192
2193inline XMVECTOR XMVectorIsNaN
2194(
2195 FXMVECTOR V
2196)
2197{
2198#if defined(_XM_NO_INTRINSICS_)
2199
2200 XMVECTOR Control;
2201 Control.vector4_u32[0] = XMISNAN(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
2202 Control.vector4_u32[1] = XMISNAN(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
2203 Control.vector4_u32[2] = XMISNAN(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
2204 Control.vector4_u32[3] = XMISNAN(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
2205 return Control;
2206
2207#elif defined(_XM_ARM_NEON_INTRINSICS_)
2208 // Test against itself. NaN is always not equal
2209 __n128 vTempNan = vceqq_f32( V, V );
2210 // Flip results
2211 return vmvnq_u32( vTempNan );
2212#elif defined(_XM_SSE_INTRINSICS_)
2213 // Test against itself. NaN is always not equal
2214 return _mm_cmpneq_ps(V,V);
2215#else // _XM_VMX128_INTRINSICS_
2216#endif // _XM_VMX128_INTRINSICS_
2217}
2218
2219//------------------------------------------------------------------------------
2220
2221inline XMVECTOR XMVectorIsInfinite
2222(
2223 FXMVECTOR V
2224)
2225{
2226#if defined(_XM_NO_INTRINSICS_)
2227
2228 XMVECTOR Control;
2229 Control.vector4_u32[0] = XMISINF(V.vector4_f32[0]) ? 0xFFFFFFFFU : 0;
2230 Control.vector4_u32[1] = XMISINF(V.vector4_f32[1]) ? 0xFFFFFFFFU : 0;
2231 Control.vector4_u32[2] = XMISINF(V.vector4_f32[2]) ? 0xFFFFFFFFU : 0;
2232 Control.vector4_u32[3] = XMISINF(V.vector4_f32[3]) ? 0xFFFFFFFFU : 0;
2233 return Control;
2234
2235#elif defined(_XM_ARM_NEON_INTRINSICS_)
2236 // Mask off the sign bit
2237 __n128 vTemp = vandq_u32(V,g_XMAbsMask);
2238 // Compare to infinity
2239 vTemp = vceqq_f32(vTemp,g_XMInfinity);
2240 // If any are infinity, the signs are true.
2241 return vTemp;
2242#elif defined(_XM_SSE_INTRINSICS_)
2243 // Mask off the sign bit
2244 __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
2245 // Compare to infinity
2246 vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
2247 // If any are infinity, the signs are true.
2248 return vTemp;
2249#else // _XM_VMX128_INTRINSICS_
2250#endif // _XM_VMX128_INTRINSICS_
2251}
2252
2253//------------------------------------------------------------------------------
2254// Rounding and clamping operations
2255//------------------------------------------------------------------------------
2256
2257//------------------------------------------------------------------------------
2258
2259inline XMVECTOR XMVectorMin
2260(
2261 FXMVECTOR V1,
2262 FXMVECTOR V2
2263)
2264{
2265#if defined(_XM_NO_INTRINSICS_)
2266
2267 XMVECTOR Result;
2268 Result.vector4_f32[0] = (V1.vector4_f32[0] < V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
2269 Result.vector4_f32[1] = (V1.vector4_f32[1] < V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
2270 Result.vector4_f32[2] = (V1.vector4_f32[2] < V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
2271 Result.vector4_f32[3] = (V1.vector4_f32[3] < V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3];
2272 return Result;
2273
2274#elif defined(_XM_ARM_NEON_INTRINSICS_)
2275 return vminq_f32( V1, V2 );
2276#elif defined(_XM_SSE_INTRINSICS_)
2277 return _mm_min_ps( V1, V2 );
2278#else // _XM_VMX128_INTRINSICS_
2279#endif // _XM_VMX128_INTRINSICS_
2280}
2281
2282//------------------------------------------------------------------------------
2283
2284inline XMVECTOR XMVectorMax
2285(
2286 FXMVECTOR V1,
2287 FXMVECTOR V2
2288)
2289{
2290#if defined(_XM_NO_INTRINSICS_)
2291
2292 XMVECTOR Result;
2293 Result.vector4_f32[0] = (V1.vector4_f32[0] > V2.vector4_f32[0]) ? V1.vector4_f32[0] : V2.vector4_f32[0];
2294 Result.vector4_f32[1] = (V1.vector4_f32[1] > V2.vector4_f32[1]) ? V1.vector4_f32[1] : V2.vector4_f32[1];
2295 Result.vector4_f32[2] = (V1.vector4_f32[2] > V2.vector4_f32[2]) ? V1.vector4_f32[2] : V2.vector4_f32[2];
2296 Result.vector4_f32[3] = (V1.vector4_f32[3] > V2.vector4_f32[3]) ? V1.vector4_f32[3] : V2.vector4_f32[3];
2297 return Result;
2298
2299#elif defined(_XM_ARM_NEON_INTRINSICS_)
2300 return vmaxq_f32( V1, V2 );
2301#elif defined(_XM_SSE_INTRINSICS_)
2302 return _mm_max_ps( V1, V2 );
2303#else // _XM_VMX128_INTRINSICS_
2304#endif // _XM_VMX128_INTRINSICS_
2305}
2306
2307//------------------------------------------------------------------------------
2308
// Round each component of V to the nearest integer (half values biased away
// from zero on the scalar path).  Values with magnitude >= 2^23 (8388608),
// NaN and INF already have no fractional part and are passed through
// unchanged on the intrinsic paths.
inline XMVECTOR XMVectorRound
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();
    const XMVECTOR BiasPos = XMVectorReplicate(0.5f);
    const XMVECTOR BiasNeg = XMVectorReplicate(-0.5f);

    // Bias by +/-0.5 toward the value's sign, then truncate toward zero
    XMVECTOR Bias = XMVectorLess(V, Zero);
    Bias = XMVectorSelect(BiasPos, BiasNeg, Bias);
    XMVECTOR Result = XMVectorAdd(V, Bias);
    Result = XMVectorTruncate(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Lanes with |V| < 2^23 can be rounded via an int round-trip
    __n128 vTest = vabsq_f32( V );
    vTest = vcltq_f32( vTest, g_XMNoFraction );

    __n128 Bias = vcltq_f32( V, vdupq_n_u32(0) );

    // Bias by +/-0.5 toward the value's sign before converting to int
    __n128 BiasPos = vdupq_n_f32( 0.5f );
    __n128 BiasNeg = vdupq_n_f32( -0.5f );
    Bias = vbslq_f32( Bias, BiasNeg, BiasPos );
    __n128 V0 = vaddq_f32( V, Bias );
    __n128 vInt = vcvtq_s32_f32( V0 );
    __n128 vResult = vcvtq_f32_s32( vInt );

    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    __m128i vInt = _mm_cvtps_epi32(V);
    // Convert back to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
2362
2363//------------------------------------------------------------------------------
2364
// Truncate each component of V toward zero.  Components with magnitude
// >= 2^23 (8388608) already have no fractional part and are passed through;
// on the scalar path NaN inputs are replaced by the quiet NaN bit pattern.
inline XMVECTOR XMVectorTruncate
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    uint32_t i;

    // Avoid C4701
    Result.vector4_f32[0] = 0.0f;

    for (i = 0; i < 4; i++)
    {
        if (XMISNAN(V.vector4_f32[i]))
        {
            // 0x7FC00000 is the canonical quiet NaN bit pattern
            Result.vector4_u32[i] = 0x7FC00000;
        }
        else if (fabsf(V.vector4_f32[i]) < 8388608.0f)
        {
            // Small enough to round-trip through int32 without overflow
            Result.vector4_f32[i] = (float)((int32_t)V.vector4_f32[i]);
        }
        else
        {
            // |x| >= 2^23: the float has no fractional bits, keep it
            Result.vector4_f32[i] = V.vector4_f32[i];
        }
    }
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Lanes with |V| < 2^23 can be truncated via an int round-trip
    __n128 vTest = vabsq_f32( V );
    vTest = vcltq_f32( vTest, g_XMNoFraction );

    __n128 vInt = vcvtq_s32_f32( V );
    __n128 vResult = vcvtq_f32_s32( vInt );

    // All numbers less than 8388608 will use the round to int
    // All others, use the ORIGINAL value
    return vbslq_f32( vTest, vResult, V );
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding with truncation
    __m128i vInt = _mm_cvttps_epi32(V);
    // Convert back to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
2423
2424//------------------------------------------------------------------------------
2425
// Round each component of V downward (toward -infinity).  The intrinsic paths
// subtract a bias just under 0.5 and then round-to-nearest, so the combined
// effect is a floor; lanes with |V| >= 2^23 (no fractional part) keep their
// original value on the SSE path.
inline XMVECTOR XMVectorFloor
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR vResult = {
        floorf(V.vector4_f32[0]),
        floorf(V.vector4_f32[1]),
        floorf(V.vector4_f32[2]),
        floorf(V.vector4_f32[3])
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 0x3EFFFFA0 is the bit pattern of a float just below 0.5; the integer
    // bits are splatted and reinterpreted as floats by the subtract.
    __n128 V0 = vsubq_f32( V, vdupq_n_u32(0x3EFFFFA0) );
    return XMVectorRound(V0);
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    XMVECTOR vResult = _mm_sub_ps(V,g_XMOneHalfMinusEpsilon);
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Convert back to floats
    vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
2464
2465//------------------------------------------------------------------------------
2466
// Round each component of V upward (toward +infinity).  The intrinsic paths
// add a bias just under 0.5 and then round-to-nearest, so the combined effect
// is a ceiling; lanes with |V| >= 2^23 (no fractional part) keep their
// original value on the SSE path.
inline XMVECTOR XMVectorCeiling
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR vResult = {
        ceilf(V.vector4_f32[0]),
        ceilf(V.vector4_f32[1]),
        ceilf(V.vector4_f32[2]),
        ceilf(V.vector4_f32[3])
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // 0x3EFFFFA0 is the bit pattern of a float just below 0.5; the integer
    // bits are splatted and reinterpreted as floats by the add.
    __n128 V0 = vaddq_f32( V, vdupq_n_u32(0x3EFFFFA0) );
    return XMVectorRound(V0);
#elif defined(_XM_SSE_INTRINSICS_)
    // To handle NAN, INF and numbers greater than 8388608, use masking
    // Get the abs value
    __m128i vTest = _mm_and_si128(_mm_castps_si128(V),g_XMAbsMask);
    // Test for greater than 8388608 (All floats with NO fractionals, NAN and INF
    vTest = _mm_cmplt_epi32(vTest,g_XMNoFraction);
    // Convert to int and back to float for rounding
    XMVECTOR vResult = _mm_add_ps(V,g_XMOneHalfMinusEpsilon);
    __m128i vInt = _mm_cvtps_epi32(vResult);
    // Convert back to floats
    vResult = _mm_cvtepi32_ps(vInt);
    // All numbers less than 8388608 will use the round to int
    vResult = _mm_and_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    // All others, use the ORIGINAL value
    vTest = _mm_andnot_si128(vTest,_mm_castps_si128(V));
    vResult = _mm_or_ps(vResult,reinterpret_cast<const XMVECTOR *>(&vTest)[0]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
2504
2505//------------------------------------------------------------------------------
2506
2507inline XMVECTOR XMVectorClamp
2508(
2509 FXMVECTOR V,
2510 FXMVECTOR Min,
2511 FXMVECTOR Max
2512)
2513{
2514 assert(XMVector4LessOrEqual(Min, Max));
2515
2516#if defined(_XM_NO_INTRINSICS_)
2517
2518 XMVECTOR Result;
2519 Result = XMVectorMax(Min, V);
2520 Result = XMVectorMin(Max, Result);
2521 return Result;
2522
2523#elif defined(_XM_ARM_NEON_INTRINSICS_)
2524 XMVECTOR vResult;
2525 vResult = vmaxq_f32(Min,V);
2526 vResult = vminq_f32(vResult,Max);
2527 return vResult;
2528#elif defined(_XM_SSE_INTRINSICS_)
2529 XMVECTOR vResult;
2530 vResult = _mm_max_ps(Min,V);
2531 vResult = _mm_min_ps(vResult,Max);
2532 return vResult;
2533#else // _XM_VMX128_INTRINSICS_
2534#endif // _XM_VMX128_INTRINSICS_
2535}
2536
2537//------------------------------------------------------------------------------
2538
2539inline XMVECTOR XMVectorSaturate
2540(
2541 FXMVECTOR V
2542)
2543{
2544#if defined(_XM_NO_INTRINSICS_)
2545
2546 const XMVECTOR Zero = XMVectorZero();
2547
2548 return XMVectorClamp(V, Zero, g_XMOne.v);
2549
2550#elif defined(_XM_ARM_NEON_INTRINSICS_)
2551 // Set <0 to 0
2552 XMVECTOR vResult = vmaxq_f32(V, vdupq_n_u32(0) );
2553 // Set>1 to 1
2554 return vminq_f32(vResult, vdupq_n_f32(1.0f) );
2555#elif defined(_XM_SSE_INTRINSICS_)
2556 // Set <0 to 0
2557 XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
2558 // Set>1 to 1
2559 return _mm_min_ps(vResult,g_XMOne);
2560#else // _XM_VMX128_INTRINSICS_
2561#endif // _XM_VMX128_INTRINSICS_
2562}
2563
2564//------------------------------------------------------------------------------
2565// Bitwise logical operations
2566//------------------------------------------------------------------------------
2567
2568inline XMVECTOR XMVectorAndInt
2569(
2570 FXMVECTOR V1,
2571 FXMVECTOR V2
2572)
2573{
2574#if defined(_XM_NO_INTRINSICS_)
2575
2576 XMVECTOR Result;
2577 Result.vector4_u32[0] = V1.vector4_u32[0] & V2.vector4_u32[0];
2578 Result.vector4_u32[1] = V1.vector4_u32[1] & V2.vector4_u32[1];
2579 Result.vector4_u32[2] = V1.vector4_u32[2] & V2.vector4_u32[2];
2580 Result.vector4_u32[3] = V1.vector4_u32[3] & V2.vector4_u32[3];
2581 return Result;
2582
2583#elif defined(_XM_ARM_NEON_INTRINSICS_)
2584 return vandq_u32(V1,V2);
2585#elif defined(_XM_SSE_INTRINSICS_)
2586 return _mm_and_ps(V1,V2);
2587#else // _XM_VMX128_INTRINSICS_
2588#endif // _XM_VMX128_INTRINSICS_
2589}
2590
2591//------------------------------------------------------------------------------
2592
2593inline XMVECTOR XMVectorAndCInt
2594(
2595 FXMVECTOR V1,
2596 FXMVECTOR V2
2597)
2598{
2599#if defined(_XM_NO_INTRINSICS_)
2600
2601 XMVECTOR Result;
2602 Result.vector4_u32[0] = V1.vector4_u32[0] & ~V2.vector4_u32[0];
2603 Result.vector4_u32[1] = V1.vector4_u32[1] & ~V2.vector4_u32[1];
2604 Result.vector4_u32[2] = V1.vector4_u32[2] & ~V2.vector4_u32[2];
2605 Result.vector4_u32[3] = V1.vector4_u32[3] & ~V2.vector4_u32[3];
2606 return Result;
2607
2608#elif defined(_XM_ARM_NEON_INTRINSICS_)
2609 return vbicq_u32(V1,V2);
2610#elif defined(_XM_SSE_INTRINSICS_)
2611 __m128i V = _mm_andnot_si128( _mm_castps_si128(V2), _mm_castps_si128(V1) );
2612 return reinterpret_cast<__m128 *>(&V)[0];
2613#else // _XM_VMX128_INTRINSICS_
2614#endif // _XM_VMX128_INTRINSICS_
2615}
2616
2617//------------------------------------------------------------------------------
2618
2619inline XMVECTOR XMVectorOrInt
2620(
2621 FXMVECTOR V1,
2622 FXMVECTOR V2
2623)
2624{
2625#if defined(_XM_NO_INTRINSICS_)
2626
2627 XMVECTOR Result;
2628 Result.vector4_u32[0] = V1.vector4_u32[0] | V2.vector4_u32[0];
2629 Result.vector4_u32[1] = V1.vector4_u32[1] | V2.vector4_u32[1];
2630 Result.vector4_u32[2] = V1.vector4_u32[2] | V2.vector4_u32[2];
2631 Result.vector4_u32[3] = V1.vector4_u32[3] | V2.vector4_u32[3];
2632 return Result;
2633
2634#elif defined(_XM_ARM_NEON_INTRINSICS_)
2635 return vorrq_u32(V1,V2);
2636#elif defined(_XM_SSE_INTRINSICS_)
2637 __m128i V = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
2638 return reinterpret_cast<__m128 *>(&V)[0];
2639#else // _XM_VMX128_INTRINSICS_
2640#endif // _XM_VMX128_INTRINSICS_
2641}
2642
2643//------------------------------------------------------------------------------
2644
2645inline XMVECTOR XMVectorNorInt
2646(
2647 FXMVECTOR V1,
2648 FXMVECTOR V2
2649)
2650{
2651#if defined(_XM_NO_INTRINSICS_)
2652
2653 XMVECTOR Result;
2654 Result.vector4_u32[0] = ~(V1.vector4_u32[0] | V2.vector4_u32[0]);
2655 Result.vector4_u32[1] = ~(V1.vector4_u32[1] | V2.vector4_u32[1]);
2656 Result.vector4_u32[2] = ~(V1.vector4_u32[2] | V2.vector4_u32[2]);
2657 Result.vector4_u32[3] = ~(V1.vector4_u32[3] | V2.vector4_u32[3]);
2658 return Result;
2659
2660#elif defined(_XM_ARM_NEON_INTRINSICS_)
2661 __n128 Result = vorrq_u32(V1,V2);
2662 return vbicq_u32(g_XMNegOneMask, Result);
2663#elif defined(_XM_SSE_INTRINSICS_)
2664 __m128i Result;
2665 Result = _mm_or_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
2666 Result = _mm_andnot_si128( Result,g_XMNegOneMask);
2667 return reinterpret_cast<__m128 *>(&Result)[0];
2668#else // _XM_VMX128_INTRINSICS_
2669#endif // _XM_VMX128_INTRINSICS_
2670}
2671
2672//------------------------------------------------------------------------------
2673
2674inline XMVECTOR XMVectorXorInt
2675(
2676 FXMVECTOR V1,
2677 FXMVECTOR V2
2678)
2679{
2680#if defined(_XM_NO_INTRINSICS_)
2681
2682 XMVECTOR Result;
2683 Result.vector4_u32[0] = V1.vector4_u32[0] ^ V2.vector4_u32[0];
2684 Result.vector4_u32[1] = V1.vector4_u32[1] ^ V2.vector4_u32[1];
2685 Result.vector4_u32[2] = V1.vector4_u32[2] ^ V2.vector4_u32[2];
2686 Result.vector4_u32[3] = V1.vector4_u32[3] ^ V2.vector4_u32[3];
2687 return Result;
2688
2689#elif defined(_XM_ARM_NEON_INTRINSICS_)
2690 return veorq_u32(V1,V2);
2691#elif defined(_XM_SSE_INTRINSICS_)
2692 __m128i V = _mm_xor_si128( _mm_castps_si128(V1), _mm_castps_si128(V2) );
2693 return reinterpret_cast<__m128 *>(&V)[0];
2694#else // _XM_VMX128_INTRINSICS_
2695#endif // _XM_VMX128_INTRINSICS_
2696}
2697
2698//------------------------------------------------------------------------------
2699// Computation operations
2700//------------------------------------------------------------------------------
2701
2702//------------------------------------------------------------------------------
2703
2704inline XMVECTOR XMVectorNegate
2705(
2706 FXMVECTOR V
2707)
2708{
2709#if defined(_XM_NO_INTRINSICS_)
2710
2711 XMVECTOR Result;
2712 Result.vector4_f32[0] = -V.vector4_f32[0];
2713 Result.vector4_f32[1] = -V.vector4_f32[1];
2714 Result.vector4_f32[2] = -V.vector4_f32[2];
2715 Result.vector4_f32[3] = -V.vector4_f32[3];
2716 return Result;
2717
2718#elif defined(_XM_ARM_NEON_INTRINSICS_)
2719 return vnegq_f32(V);
2720#elif defined(_XM_SSE_INTRINSICS_)
2721 XMVECTOR Z;
2722
2723 Z = _mm_setzero_ps();
2724
2725 return _mm_sub_ps( Z, V );
2726#else // _XM_VMX128_INTRINSICS_
2727#endif // _XM_VMX128_INTRINSICS_
2728}
2729
2730//------------------------------------------------------------------------------
2731
2732inline XMVECTOR XMVectorAdd
2733(
2734 FXMVECTOR V1,
2735 FXMVECTOR V2
2736)
2737{
2738#if defined(_XM_NO_INTRINSICS_)
2739
2740 XMVECTOR Result;
2741 Result.vector4_f32[0] = V1.vector4_f32[0] + V2.vector4_f32[0];
2742 Result.vector4_f32[1] = V1.vector4_f32[1] + V2.vector4_f32[1];
2743 Result.vector4_f32[2] = V1.vector4_f32[2] + V2.vector4_f32[2];
2744 Result.vector4_f32[3] = V1.vector4_f32[3] + V2.vector4_f32[3];
2745 return Result;
2746
2747#elif defined(_XM_ARM_NEON_INTRINSICS_)
2748 return vaddq_f32( V1, V2 );
2749#elif defined(_XM_SSE_INTRINSICS_)
2750 return _mm_add_ps( V1, V2 );
2751#else // _XM_VMX128_INTRINSICS_
2752#endif // _XM_VMX128_INTRINSICS_
2753}
2754
2755//------------------------------------------------------------------------------
2756
// Component-wise angle addition with wrap-around: adds V1 + V2 and folds the
// result back into [-Pi, Pi) by adding or subtracting 2Pi where needed.
// Assumes -Pi <= V1 < Pi and -2Pi <= V2 <= 2Pi per component.
inline XMVECTOR XMVectorAddAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    // Add the given angles together.  If the range of V1 is such
    // that -Pi <= V1 < Pi and the range of V2 is such that
    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
    // will be -Pi <= Result < Pi.
    XMVECTOR Result = XMVectorAdd(V1, V2);

    // Components that fell below -Pi get +2Pi added
    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);

    // Components at or above Pi get -2Pi added instead
    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);

    Result = XMVectorAdd(Result, Offset);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Adjust the angles
    __n128 vResult = vaddq_f32(V1,V2);
    // Less than Pi?
    __n128 vOffset = vcltq_f32(vResult,g_XMNegativePi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = vaddq_f32(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = vcgeq_f32(vResult,g_XMPi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = vsubq_f32(vResult,vOffset);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Adjust the angles
    XMVECTOR vResult = _mm_add_ps(V1,V2);
    // Less than Pi?
    XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = _mm_add_ps(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = _mm_cmpge_ps(vResult,g_XMPi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = _mm_sub_ps(vResult,vOffset);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
2814
2815//------------------------------------------------------------------------------
2816
2817inline XMVECTOR XMVectorSubtract
2818(
2819 FXMVECTOR V1,
2820 FXMVECTOR V2
2821)
2822{
2823#if defined(_XM_NO_INTRINSICS_)
2824
2825 XMVECTOR Result;
2826 Result.vector4_f32[0] = V1.vector4_f32[0] - V2.vector4_f32[0];
2827 Result.vector4_f32[1] = V1.vector4_f32[1] - V2.vector4_f32[1];
2828 Result.vector4_f32[2] = V1.vector4_f32[2] - V2.vector4_f32[2];
2829 Result.vector4_f32[3] = V1.vector4_f32[3] - V2.vector4_f32[3];
2830 return Result;
2831
2832#elif defined(_XM_ARM_NEON_INTRINSICS_)
2833 return vsubq_f32( V1, V2 );
2834#elif defined(_XM_SSE_INTRINSICS_)
2835 return _mm_sub_ps( V1, V2 );
2836#else // _XM_VMX128_INTRINSICS_
2837#endif // _XM_VMX128_INTRINSICS_
2838}
2839
2840//------------------------------------------------------------------------------
2841
// Component-wise angle subtraction with wrap-around: computes V1 - V2 and
// folds the result back into [-Pi, Pi) by adding or subtracting 2Pi where
// needed.  Assumes -Pi <= V1 < Pi and -2Pi <= V2 <= 2Pi per component.
inline XMVECTOR XMVectorSubtractAngles
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    // Subtract the given angles.  If the range of V1 is such
    // that -Pi <= V1 < Pi and the range of V2 is such that
    // -2Pi <= V2 <= 2Pi, then the range of the resulting angle
    // will be -Pi <= Result < Pi.
    XMVECTOR Result = XMVectorSubtract(V1, V2);

    // Components that fell below -Pi get +2Pi added
    XMVECTOR Mask = XMVectorLess(Result, g_XMNegativePi.v);
    XMVECTOR Offset = XMVectorSelect(Zero, g_XMTwoPi.v, Mask);

    // Components at or above Pi get -2Pi added instead
    Mask = XMVectorGreaterOrEqual(Result, g_XMPi.v);
    Offset = XMVectorSelect(Offset, g_XMNegativeTwoPi.v, Mask);

    Result = XMVectorAdd(Result, Offset);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Adjust the angles
    __n128 vResult = vsubq_f32(V1,V2);
    // Less than Pi?
    __n128 vOffset = vcltq_f32(vResult,g_XMNegativePi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = vaddq_f32(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = vcgeq_f32(vResult,g_XMPi);
    vOffset = vandq_u32(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = vsubq_f32(vResult,vOffset);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Adjust the angles
    XMVECTOR vResult = _mm_sub_ps(V1,V2);
    // Less than Pi?
    XMVECTOR vOffset = _mm_cmplt_ps(vResult,g_XMNegativePi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Add 2Pi to all entries less than -Pi
    vResult = _mm_add_ps(vResult,vOffset);
    // Greater than or equal to Pi?
    vOffset = _mm_cmpge_ps(vResult,g_XMPi);
    vOffset = _mm_and_ps(vOffset,g_XMTwoPi);
    // Sub 2Pi to all entries greater than Pi
    vResult = _mm_sub_ps(vResult,vOffset);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
2899
2900//------------------------------------------------------------------------------
2901
2902inline XMVECTOR XMVectorMultiply
2903(
2904 FXMVECTOR V1,
2905 FXMVECTOR V2
2906)
2907{
2908#if defined(_XM_NO_INTRINSICS_)
2909 XMVECTOR Result = {
2910 V1.vector4_f32[0] * V2.vector4_f32[0],
2911 V1.vector4_f32[1] * V2.vector4_f32[1],
2912 V1.vector4_f32[2] * V2.vector4_f32[2],
2913 V1.vector4_f32[3] * V2.vector4_f32[3]
2914 };
2915 return Result;
2916#elif defined(_XM_ARM_NEON_INTRINSICS_)
2917 return vmulq_f32( V1, V2 );
2918#elif defined(_XM_SSE_INTRINSICS_)
2919 return _mm_mul_ps( V1, V2 );
2920#else // _XM_VMX128_INTRINSICS_
2921#endif // _XM_VMX128_INTRINSICS_
2922}
2923
2924//------------------------------------------------------------------------------
2925
2926inline XMVECTOR XMVectorMultiplyAdd
2927(
2928 FXMVECTOR V1,
2929 FXMVECTOR V2,
2930 FXMVECTOR V3
2931)
2932{
2933#if defined(_XM_NO_INTRINSICS_)
2934 XMVECTOR vResult = {
2935 (V1.vector4_f32[0] * V2.vector4_f32[0]) + V3.vector4_f32[0],
2936 (V1.vector4_f32[1] * V2.vector4_f32[1]) + V3.vector4_f32[1],
2937 (V1.vector4_f32[2] * V2.vector4_f32[2]) + V3.vector4_f32[2],
2938 (V1.vector4_f32[3] * V2.vector4_f32[3]) + V3.vector4_f32[3]
2939 };
2940 return vResult;
2941
2942#elif defined(_XM_ARM_NEON_INTRINSICS_)
2943 return vmlaq_f32( V3, V1, V2 );
2944#elif defined(_XM_SSE_INTRINSICS_)
2945 XMVECTOR vResult = _mm_mul_ps( V1, V2 );
2946 return _mm_add_ps(vResult, V3 );
2947#else // _XM_VMX128_INTRINSICS_
2948#endif // _XM_VMX128_INTRINSICS_
2949}
2950
2951//------------------------------------------------------------------------------
2952
2953inline XMVECTOR XMVectorDivide
2954(
2955 FXMVECTOR V1,
2956 FXMVECTOR V2
2957)
2958{
2959#if defined(_XM_NO_INTRINSICS_)
2960 XMVECTOR Result;
2961 Result.vector4_f32[0] = V1.vector4_f32[0] / V2.vector4_f32[0];
2962 Result.vector4_f32[1] = V1.vector4_f32[1] / V2.vector4_f32[1];
2963 Result.vector4_f32[2] = V1.vector4_f32[2] / V2.vector4_f32[2];
2964 Result.vector4_f32[3] = V1.vector4_f32[3] / V2.vector4_f32[3];
2965 return Result;
2966#elif defined(_XM_ARM_NEON_INTRINSICS_)
2967 // 2 iterations of Newton-Raphson refinement of reciprocal
2968 __n128 Reciprocal = vrecpeq_f32(V2);
2969 __n128 S = vrecpsq_f32( Reciprocal, V2 );
2970 Reciprocal = vmulq_f32( S, Reciprocal );
2971 S = vrecpsq_f32( Reciprocal, V2 );
2972 Reciprocal = vmulq_f32( S, Reciprocal );
2973 return vmulq_f32( V1, Reciprocal );
2974#elif defined(_XM_SSE_INTRINSICS_)
2975 return _mm_div_ps( V1, V2 );
2976#else // _XM_VMX128_INTRINSICS_
2977#endif // _XM_VMX128_INTRINSICS_
2978}
2979
2980//------------------------------------------------------------------------------
2981
2982inline XMVECTOR XMVectorNegativeMultiplySubtract
2983(
2984 FXMVECTOR V1,
2985 FXMVECTOR V2,
2986 FXMVECTOR V3
2987)
2988{
2989#if defined(_XM_NO_INTRINSICS_)
2990
2991 XMVECTOR vResult = {
2992 V3.vector4_f32[0] - (V1.vector4_f32[0] * V2.vector4_f32[0]),
2993 V3.vector4_f32[1] - (V1.vector4_f32[1] * V2.vector4_f32[1]),
2994 V3.vector4_f32[2] - (V1.vector4_f32[2] * V2.vector4_f32[2]),
2995 V3.vector4_f32[3] - (V1.vector4_f32[3] * V2.vector4_f32[3])
2996 };
2997 return vResult;
2998
2999#elif defined(_XM_ARM_NEON_INTRINSICS_)
3000 return vmlsq_f32( V3, V1, V2 );
3001#elif defined(_XM_SSE_INTRINSICS_)
3002 XMVECTOR R = _mm_mul_ps( V1, V2 );
3003 return _mm_sub_ps( V3, R );
3004#else // _XM_VMX128_INTRINSICS_
3005#endif // _XM_VMX128_INTRINSICS_
3006}
3007
3008//------------------------------------------------------------------------------
3009
3010inline XMVECTOR XMVectorScale
3011(
3012 FXMVECTOR V,
3013 float ScaleFactor
3014)
3015{
3016#if defined(_XM_NO_INTRINSICS_)
3017 XMVECTOR vResult = {
3018 V.vector4_f32[0] * ScaleFactor,
3019 V.vector4_f32[1] * ScaleFactor,
3020 V.vector4_f32[2] * ScaleFactor,
3021 V.vector4_f32[3] * ScaleFactor
3022 };
3023 return vResult;
3024
3025#elif defined(_XM_ARM_NEON_INTRINSICS_)
3026 return vmulq_n_f32( V, ScaleFactor );
3027#elif defined(_XM_SSE_INTRINSICS_)
3028 XMVECTOR vResult = _mm_set_ps1(ScaleFactor);
3029 return _mm_mul_ps(vResult,V);
3030#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
3031#endif // _XM_VMX128_INTRINSICS_
3032}
3033
3034//------------------------------------------------------------------------------
3035
3036inline XMVECTOR XMVectorReciprocalEst
3037(
3038 FXMVECTOR V
3039)
3040{
3041#if defined(_XM_NO_INTRINSICS_)
3042 XMVECTOR Result;
3043 Result.vector4_f32[0] = 1.f / V.vector4_f32[0];
3044 Result.vector4_f32[1] = 1.f / V.vector4_f32[1];
3045 Result.vector4_f32[2] = 1.f / V.vector4_f32[2];
3046 Result.vector4_f32[3] = 1.f / V.vector4_f32[3];
3047 return Result;
3048#elif defined(_XM_ARM_NEON_INTRINSICS_)
3049 return vrecpeq_f32(V);
3050#elif defined(_XM_SSE_INTRINSICS_)
3051 return _mm_rcp_ps(V);
3052#else // _XM_VMX128_INTRINSICS_
3053#endif // _XM_VMX128_INTRINSICS_
3054}
3055
3056//------------------------------------------------------------------------------
3057
3058inline XMVECTOR XMVectorReciprocal
3059(
3060 FXMVECTOR V
3061)
3062{
3063#if defined(_XM_NO_INTRINSICS_)
3064 XMVECTOR Result;
3065 Result.vector4_f32[0] = 1.f / V.vector4_f32[0];
3066 Result.vector4_f32[1] = 1.f / V.vector4_f32[1];
3067 Result.vector4_f32[2] = 1.f / V.vector4_f32[2];
3068 Result.vector4_f32[3] = 1.f / V.vector4_f32[3];
3069 return Result;
3070#elif defined(_XM_ARM_NEON_INTRINSICS_)
3071 // 2 iterations of Newton-Raphson refinement
3072 __n128 Reciprocal = vrecpeq_f32(V);
3073 __n128 S = vrecpsq_f32( Reciprocal, V );
3074 Reciprocal = vmulq_f32( S, Reciprocal );
3075 S = vrecpsq_f32( Reciprocal, V );
3076 return vmulq_f32( S, Reciprocal );
3077#elif defined(_XM_SSE_INTRINSICS_)
3078 return _mm_div_ps(g_XMOne,V);
3079#else // _XM_VMX128_INTRINSICS_
3080#endif // _XM_VMX128_INTRINSICS_
3081}
3082
3083//------------------------------------------------------------------------------
3084// Return an estimated square root
3085inline XMVECTOR XMVectorSqrtEst
3086(
3087 FXMVECTOR V
3088)
3089{
3090#if defined(_XM_NO_INTRINSICS_)
3091 XMVECTOR Result;
3092 Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] );
3093 Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] );
3094 Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] );
3095 Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] );
3096 return Result;
3097#elif defined(_XM_ARM_NEON_INTRINSICS_)
3098 // 1 iteration of Newton-Raphson refinment of sqrt
3099 __n128 S0 = vrsqrteq_f32(V);
3100 __n128 P0 = vmulq_f32( V, S0 );
3101 __n128 R0 = vrsqrtsq_f32( P0, S0 );
3102 __n128 S1 = vmulq_f32( S0, R0 );
3103
3104 XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
3105 XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) );
3106 __n128 Result = vmulq_f32( V, S1 );
3107 XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
3108 return XMVectorSelect(V, Result, Select);
3109#elif defined(_XM_SSE_INTRINSICS_)
3110 return _mm_sqrt_ps(V);
3111#else // _XM_VMX128_INTRINSICS_
3112#endif // _XM_VMX128_INTRINSICS_
3113}
3114
3115//------------------------------------------------------------------------------
3116
3117inline XMVECTOR XMVectorSqrt
3118(
3119 FXMVECTOR V
3120)
3121{
3122#if defined(_XM_NO_INTRINSICS_)
3123 XMVECTOR Result;
3124 Result.vector4_f32[0] = sqrtf( V.vector4_f32[0] );
3125 Result.vector4_f32[1] = sqrtf( V.vector4_f32[1] );
3126 Result.vector4_f32[2] = sqrtf( V.vector4_f32[2] );
3127 Result.vector4_f32[3] = sqrtf( V.vector4_f32[3] );
3128 return Result;
3129#elif defined(_XM_ARM_NEON_INTRINSICS_)
3130 // 3 iterations of Newton-Raphson refinment of sqrt
3131 __n128 S0 = vrsqrteq_f32(V);
3132 __n128 P0 = vmulq_f32( V, S0 );
3133 __n128 R0 = vrsqrtsq_f32( P0, S0 );
3134 __n128 S1 = vmulq_f32( S0, R0 );
3135 __n128 P1 = vmulq_f32( V, S1 );
3136 __n128 R1 = vrsqrtsq_f32( P1, S1 );
3137 __n128 S2 = vmulq_f32( S1, R1 );
3138 __n128 P2 = vmulq_f32( V, S2 );
3139 __n128 R2 = vrsqrtsq_f32( P2, S2 );
3140 __n128 S3 = vmulq_f32( S2, R2 );
3141
3142 XMVECTOR VEqualsInfinity = XMVectorEqualInt(V, g_XMInfinity.v);
3143 XMVECTOR VEqualsZero = XMVectorEqual(V, vdupq_n_f32(0) );
3144 __n128 Result = vmulq_f32( V, S3 );
3145 XMVECTOR Select = XMVectorEqualInt(VEqualsInfinity, VEqualsZero);
3146 return XMVectorSelect(V, Result, Select);
3147#elif defined(_XM_SSE_INTRINSICS_)
3148 return _mm_sqrt_ps(V);
3149#else // _XM_VMX128_INTRINSICS_
3150#endif // _XM_VMX128_INTRINSICS_
3151}
3152
3153//------------------------------------------------------------------------------
3154
3155inline XMVECTOR XMVectorReciprocalSqrtEst
3156(
3157 FXMVECTOR V
3158)
3159{
3160#if defined(_XM_NO_INTRINSICS_)
3161 XMVECTOR Result;
3162 Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] );
3163 Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] );
3164 Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] );
3165 Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] );
3166 return Result;
3167#elif defined(_XM_ARM_NEON_INTRINSICS_)
3168 return vrsqrteq_f32(V);
3169#elif defined(_XM_SSE_INTRINSICS_)
3170 return _mm_rsqrt_ps(V);
3171#else // _XM_VMX128_INTRINSICS_
3172#endif // _XM_VMX128_INTRINSICS_
3173}
3174
3175//------------------------------------------------------------------------------
3176
3177inline XMVECTOR XMVectorReciprocalSqrt
3178(
3179 FXMVECTOR V
3180)
3181{
3182#if defined(_XM_NO_INTRINSICS_)
3183 XMVECTOR Result;
3184 Result.vector4_f32[0] = 1.f / sqrtf( V.vector4_f32[0] );
3185 Result.vector4_f32[1] = 1.f / sqrtf( V.vector4_f32[1] );
3186 Result.vector4_f32[2] = 1.f / sqrtf( V.vector4_f32[2] );
3187 Result.vector4_f32[3] = 1.f / sqrtf( V.vector4_f32[3] );
3188 return Result;
3189#elif defined(_XM_ARM_NEON_INTRINSICS_)
3190 // 2 iterations of Newton-Raphson refinement of reciprocal
3191 __n128 S0 = vrsqrteq_f32(V);
3192
3193 __n128 P0 = vmulq_f32( V, S0 );
3194 __n128 R0 = vrsqrtsq_f32( P0, S0 );
3195
3196 __n128 S1 = vmulq_f32( S0, R0 );
3197 __n128 P1 = vmulq_f32( V, S1 );
3198 __n128 R1 = vrsqrtsq_f32( P1, S1 );
3199
3200 return vmulq_f32( S1, R1 );
3201#elif defined(_XM_SSE_INTRINSICS_)
3202 XMVECTOR vResult = _mm_sqrt_ps(V);
3203 vResult = _mm_div_ps(g_XMOne,vResult);
3204 return vResult;
3205#else // _XM_VMX128_INTRINSICS_
3206#endif // _XM_VMX128_INTRINSICS_
3207}
3208
3209
3210//------------------------------------------------------------------------------
3211
3212inline XMVECTOR XMVectorExp
3213(
3214 FXMVECTOR V
3215)
3216{
3217#if defined(_XM_NO_INTRINSICS_)
3218
3219 XMVECTOR Result;
3220 Result.vector4_f32[0] = powf(2.0f, V.vector4_f32[0]);
3221 Result.vector4_f32[1] = powf(2.0f, V.vector4_f32[1]);
3222 Result.vector4_f32[2] = powf(2.0f, V.vector4_f32[2]);
3223 Result.vector4_f32[3] = powf(2.0f, V.vector4_f32[3]);
3224 return Result;
3225
3226#elif defined(_XM_ARM_NEON_INTRINSICS_)
3227 XMVECTORF32 vResult = {
3228 powf(2.0f,vgetq_lane_f32(V, 0)),
3229 powf(2.0f,vgetq_lane_f32(V, 1)),
3230 powf(2.0f,vgetq_lane_f32(V, 2)),
3231 powf(2.0f,vgetq_lane_f32(V, 3))
3232 };
3233 return vResult;
3234#elif defined(_XM_SSE_INTRINSICS_)
3235 __declspec(align(16)) float a[4];
3236 _mm_store_ps( a, V );
3237 XMVECTOR vResult = _mm_setr_ps(
3238 powf(2.0f,a[0]),
3239 powf(2.0f,a[1]),
3240 powf(2.0f,a[2]),
3241 powf(2.0f,a[3]));
3242 return vResult;
3243#else // _XM_VMX128_INTRINSICS_
3244#endif // _XM_VMX128_INTRINSICS_
3245}
3246
3247
3248//------------------------------------------------------------------------------
3249
3250inline XMVECTOR XMVectorLog
3251(
3252 FXMVECTOR V
3253)
3254{
3255#if defined(_XM_NO_INTRINSICS_)
3256
3257 const float fScale = 1.4426950f; // (1.0f / logf(2.0f));
3258
3259 XMVECTOR Result;
3260 Result.vector4_f32[0] = logf(V.vector4_f32[0])*fScale;
3261 Result.vector4_f32[1] = logf(V.vector4_f32[1])*fScale;
3262 Result.vector4_f32[2] = logf(V.vector4_f32[2])*fScale;
3263 Result.vector4_f32[3] = logf(V.vector4_f32[3])*fScale;
3264 return Result;
3265
3266#elif defined(_XM_ARM_NEON_INTRINSICS_)
3267 XMVECTOR vScale = vdupq_n_f32(1.0f / logf(2.0f));
3268 XMVECTORF32 vResult = {
3269 logf(vgetq_lane_f32(V, 0)),
3270 logf(vgetq_lane_f32(V, 1)),
3271 logf(vgetq_lane_f32(V, 2)),
3272 logf(vgetq_lane_f32(V, 3))
3273 };
3274 return vmulq_f32( vResult, vScale );
3275#elif defined(_XM_SSE_INTRINSICS_)
3276 __declspec(align(16)) float a[4];
3277 _mm_store_ps( a, V );
3278 XMVECTOR vScale = _mm_set_ps1(1.0f / logf(2.0f));
3279 XMVECTOR vResult = _mm_setr_ps(
3280 logf(a[0]),
3281 logf(a[1]),
3282 logf(a[2]),
3283 logf(a[3]));
3284 vResult = _mm_mul_ps(vResult,vScale);
3285 return vResult;
3286#else // _XM_VMX128_INTRINSICS_
3287#endif // _XM_VMX128_INTRINSICS_
3288}
3289
3290
3291//------------------------------------------------------------------------------
3292
3293inline XMVECTOR XMVectorPow
3294(
3295 FXMVECTOR V1,
3296 FXMVECTOR V2
3297)
3298{
3299#if defined(_XM_NO_INTRINSICS_)
3300
3301 XMVECTOR Result;
3302 Result.vector4_f32[0] = powf(V1.vector4_f32[0], V2.vector4_f32[0]);
3303 Result.vector4_f32[1] = powf(V1.vector4_f32[1], V2.vector4_f32[1]);
3304 Result.vector4_f32[2] = powf(V1.vector4_f32[2], V2.vector4_f32[2]);
3305 Result.vector4_f32[3] = powf(V1.vector4_f32[3], V2.vector4_f32[3]);
3306 return Result;
3307
3308#elif defined(_XM_ARM_NEON_INTRINSICS_)
3309 XMVECTORF32 vResult = {
3310 powf(vgetq_lane_f32(V1, 0), vgetq_lane_f32(V2, 0)),
3311 powf(vgetq_lane_f32(V1, 1), vgetq_lane_f32(V2, 1)),
3312 powf(vgetq_lane_f32(V1, 2), vgetq_lane_f32(V2, 2)),
3313 powf(vgetq_lane_f32(V1, 3), vgetq_lane_f32(V2, 3))
3314 };
3315 return vResult;
3316#elif defined(_XM_SSE_INTRINSICS_)
3317 __declspec(align(16)) float a[4];
3318 __declspec(align(16)) float b[4];
3319 _mm_store_ps( a, V1 );
3320 _mm_store_ps( b, V2 );
3321 XMVECTOR vResult = _mm_setr_ps(
3322 powf(a[0],b[0]),
3323 powf(a[1],b[1]),
3324 powf(a[2],b[2]),
3325 powf(a[3],b[3]));
3326 return vResult;
3327#else // _XM_VMX128_INTRINSICS_
3328#endif // _XM_VMX128_INTRINSICS_
3329}
3330
3331//------------------------------------------------------------------------------
3332
3333inline XMVECTOR XMVectorAbs
3334(
3335 FXMVECTOR V
3336)
3337{
3338#if defined(_XM_NO_INTRINSICS_)
3339 XMVECTOR vResult = {
3340 fabsf(V.vector4_f32[0]),
3341 fabsf(V.vector4_f32[1]),
3342 fabsf(V.vector4_f32[2]),
3343 fabsf(V.vector4_f32[3])
3344 };
3345 return vResult;
3346
3347#elif defined(_XM_ARM_NEON_INTRINSICS_)
3348 return vabsq_f32( V );
3349#elif defined(_XM_SSE_INTRINSICS_)
3350 XMVECTOR vResult = _mm_setzero_ps();
3351 vResult = _mm_sub_ps(vResult,V);
3352 vResult = _mm_max_ps(vResult,V);
3353 return vResult;
3354#else // _XM_VMX128_INTRINSICS_
3355#endif // _XM_VMX128_INTRINSICS_
3356}
3357
3358//------------------------------------------------------------------------------
3359
3360inline XMVECTOR XMVectorMod
3361(
3362 FXMVECTOR V1,
3363 FXMVECTOR V2
3364)
3365{
3366 // V1 % V2 = V1 - V2 * truncate(V1 / V2)
3367
3368#if defined(_XM_NO_INTRINSICS_)
3369
3370 XMVECTOR Quotient = XMVectorDivide(V1, V2);
3371 Quotient = XMVectorTruncate(Quotient);
3372 XMVECTOR Result = XMVectorNegativeMultiplySubtract(V2, Quotient, V1);
3373 return Result;
3374
3375#elif defined(_XM_ARM_NEON_INTRINSICS_)
3376 XMVECTOR vResult = XMVectorDivide(V1, V2);
3377 vResult = XMVectorTruncate(vResult);
3378 return vmlsq_f32( V1, vResult, V2 );
3379#elif defined(_XM_SSE_INTRINSICS_)
3380 XMVECTOR vResult = _mm_div_ps(V1, V2);
3381 vResult = XMVectorTruncate(vResult);
3382 vResult = _mm_mul_ps(vResult,V2);
3383 vResult = _mm_sub_ps(V1,vResult);
3384 return vResult;
3385#else // _XM_VMX128_INTRINSICS_
3386#endif // _XM_VMX128_INTRINSICS_
3387}
3388
3389//------------------------------------------------------------------------------
3390
3391inline XMVECTOR XMVectorModAngles
3392(
3393 FXMVECTOR Angles
3394)
3395{
3396#if defined(_XM_NO_INTRINSICS_)
3397
3398 XMVECTOR V;
3399 XMVECTOR Result;
3400
3401 // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI
3402 V = XMVectorMultiply(Angles, g_XMReciprocalTwoPi.v);
3403 V = XMVectorRound(V);
3404 Result = XMVectorNegativeMultiplySubtract(g_XMTwoPi.v, V, Angles);
3405 return Result;
3406
3407#elif defined(_XM_ARM_NEON_INTRINSICS_)
3408 // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI
3409 XMVECTOR vResult = vmulq_f32(Angles,g_XMReciprocalTwoPi);
3410 // Use the inline function due to complexity for rounding
3411 vResult = XMVectorRound(vResult);
3412 return vmlsq_f32( Angles, vResult, g_XMTwoPi );
3413#elif defined(_XM_SSE_INTRINSICS_)
3414 // Modulo the range of the given angles such that -XM_PI <= Angles < XM_PI
3415 XMVECTOR vResult = _mm_mul_ps(Angles,g_XMReciprocalTwoPi);
3416 // Use the inline function due to complexity for rounding
3417 vResult = XMVectorRound(vResult);
3418 vResult = _mm_mul_ps(vResult,g_XMTwoPi);
3419 vResult = _mm_sub_ps(Angles,vResult);
3420 return vResult;
3421#else // _XM_VMX128_INTRINSICS_
3422#endif // _XM_VMX128_INTRINSICS_
3423}
3424
3425//------------------------------------------------------------------------------
3426
// Computes the sine of each component of V (angles in radians) using an
// 11-degree minimax polynomial after range reduction to [-pi/2, pi/2].
inline XMVECTOR XMVectorSin
(
    FXMVECTOR V
)
{
    // 11-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: evaluate per component
    XMVECTOR Result;
    Result.vector4_f32[0] = XMScalarSin( V.vector4_f32[0] );
    Result.vector4_f32[1] = XMScalarSin( V.vector4_f32[1] );
    Result.vector4_f32[2] = XMScalarSin( V.vector4_f32[2] );
    Result.vector4_f32[3] = XMScalarSin( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with sin(y) = sin(x).
    // Uses the identity sin(pi - x) = sin(x) to reflect |x| > pi/2.
    __n128 sign = vandq_u32(x, g_XMNegativeZero);
    __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
    __n128 absx = vabsq_f32( x );
    __n128 rflx = vsubq_f32(c, x);
    __n128 comp = vcleq_f32(absx, g_XMHalfPi);
    x = vbslq_f32( comp, x, rflx );

    __n128 x2 = vmulq_f32(x, x);

    // Compute polynomial approximation (Horner evaluation in x^2;
    // coefficients splatted lane-by-lane from the constant tables)
    const XMVECTOR SC1 = g_XMSinCoefficients1;
    XMVECTOR Result = vdupq_lane_f32(vget_low_f32(SC1), 0);

    const XMVECTOR SC0 = g_XMSinCoefficients0;
    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    // Final multiply by x gives the odd polynomial for sine
    Result = vmulq_f32(Result, x);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with sin(y) = sin(x).
    // Uses the identity sin(pi - x) = sin(x) to reflect |x| > pi/2.
    __m128 sign = _mm_and_ps(x, g_XMNegativeZero);
    __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
    __m128 absx = _mm_andnot_ps(sign, x); // |x|
    __m128 rflx = _mm_sub_ps(c, x);
    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
    __m128 select0 = _mm_and_ps(comp, x);
    __m128 select1 = _mm_andnot_ps(comp, rflx);
    x = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation (Horner evaluation in x^2)
    const XMVECTOR SC1 = g_XMSinCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) );
    __m128 Result = _mm_mul_ps(vConstants, x2);

    const XMVECTOR SC0 = g_XMSinCoefficients0;
    vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);
    Result = _mm_add_ps(Result, g_XMOne);
    // Final multiply by x gives the odd polynomial for sine
    Result = _mm_mul_ps(Result, x);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
3518
3519//------------------------------------------------------------------------------
3520
// Computes the cosine of each component of V (angles in radians) using a
// 10-degree minimax polynomial after range reduction to [-pi/2, pi/2].
inline XMVECTOR XMVectorCos
(
    FXMVECTOR V
)
{
    // 10-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: evaluate per component
    XMVECTOR Result;
    Result.vector4_f32[0] = XMScalarCos( V.vector4_f32[0] );
    Result.vector4_f32[1] = XMScalarCos( V.vector4_f32[1] );
    Result.vector4_f32[2] = XMScalarCos( V.vector4_f32[2] );
    Result.vector4_f32[3] = XMScalarCos( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Map V to x in [-pi,pi].
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
    // Reflection about +/-pi/2 flips the sign of cosine, so the sign
    // vector is carried along and applied at the end.
    __n128 sign = vandq_u32(x, g_XMNegativeZero);
    __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
    __n128 absx = vabsq_f32( x );
    __n128 rflx = vsubq_f32(c, x);
    __n128 comp = vcleq_f32(absx, g_XMHalfPi);
    x = vbslq_f32( comp, x, rflx );
    sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne );

    __n128 x2 = vmulq_f32(x, x);

    // Compute polynomial approximation (Horner evaluation in x^2;
    // coefficients splatted lane-by-lane from the constant tables)
    const XMVECTOR CC1 = g_XMCosCoefficients1;
    XMVECTOR Result = vdupq_lane_f32(vget_low_f32(CC1), 0);

    const XMVECTOR CC0 = g_XMCosCoefficients0;
    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    // Apply the sign carried from the reflection step
    Result = vmulq_f32(Result, sign);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    // Map V to x in [-pi,pi].
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
    // Reflection about +/-pi/2 flips the sign of cosine, so the sign
    // vector is carried along and applied at the end.
    XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
    __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
    __m128 absx = _mm_andnot_ps(sign, x); // |x|
    __m128 rflx = _mm_sub_ps(c, x);
    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
    __m128 select0 = _mm_and_ps(comp, x);
    __m128 select1 = _mm_andnot_ps(comp, rflx);
    x = _mm_or_ps(select0, select1);
    select0 = _mm_and_ps(comp, g_XMOne);
    select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
    sign = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation (Horner evaluation in x^2)
    const XMVECTOR CC1 = g_XMCosCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) );
    __m128 Result = _mm_mul_ps(vConstants, x2);

    const XMVECTOR CC0 = g_XMCosCoefficients0;
    vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);
    Result = _mm_add_ps(Result, g_XMOne);
    // Apply the sign carried from the reflection step
    Result = _mm_mul_ps(Result, sign);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
3616
3617//------------------------------------------------------------------------------
3618
// Computes sine and cosine of each component of V (angles in radians)
// simultaneously, sharing one range reduction between the two polynomial
// evaluations. Results are written through pSin and pCos (both required).
_Use_decl_annotations_
inline void XMVectorSinCos
(
    XMVECTOR* pSin,
    XMVECTOR* pCos,
    FXMVECTOR V
)
{
    assert(pSin != NULL);
    assert(pCos != NULL);

    // 11/10-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: evaluate per component
    XMVECTOR Sin;
    XMVECTOR Cos;

    XMScalarSinCos(&Sin.vector4_f32[0], &Cos.vector4_f32[0], V.vector4_f32[0]);
    XMScalarSinCos(&Sin.vector4_f32[1], &Cos.vector4_f32[1], V.vector4_f32[1]);
    XMScalarSinCos(&Sin.vector4_f32[2], &Cos.vector4_f32[2], V.vector4_f32[2]);
    XMScalarSinCos(&Sin.vector4_f32[3], &Cos.vector4_f32[3], V.vector4_f32[3]);

    *pSin = Sin;
    *pCos = Cos;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
    // Reflection preserves sine but flips the sign of cosine.
    __n128 sign = vandq_u32(x, g_XMNegativeZero);
    __n128 c = vorrq_u32(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
    __n128 absx = vabsq_f32( x );
    __n128 rflx = vsubq_f32(c, x);
    __n128 comp = vcleq_f32(absx, g_XMHalfPi);
    x = vbslq_f32( comp, x, rflx );
    sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne );

    __n128 x2 = vmulq_f32(x, x);

    // Compute polynomial approximation for sine
    const XMVECTOR SC1 = g_XMSinCoefficients1;
    XMVECTOR Result = vdupq_lane_f32(vget_low_f32(SC1), 0);

    const XMVECTOR SC0 = g_XMSinCoefficients0;
    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_high_f32(SC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(SC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(SC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    *pSin = vmulq_f32(Result, x);

    // Compute polynomial approximation for cosine
    const XMVECTOR CC1 = g_XMCosCoefficients1;
    Result = vdupq_lane_f32(vget_low_f32(CC1), 0);

    const XMVECTOR CC0 = g_XMCosCoefficients0;
    vConstants = vdupq_lane_f32(vget_high_f32(CC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_high_f32(CC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(CC0), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(CC0), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    *pCos = vmulq_f32(Result, sign);
#elif defined(_XM_SSE_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
    // Reflection preserves sine but flips the sign of cosine.
    XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
    __m128 c = _mm_or_ps(g_XMPi, sign); // pi when x >= 0, -pi when x < 0
    __m128 absx = _mm_andnot_ps(sign, x); // |x|
    __m128 rflx = _mm_sub_ps(c, x);
    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
    __m128 select0 = _mm_and_ps(comp, x);
    __m128 select1 = _mm_andnot_ps(comp, rflx);
    x = _mm_or_ps(select0, select1);
    select0 = _mm_and_ps(comp, g_XMOne);
    select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
    sign = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation of sine
    const XMVECTOR SC1 = g_XMSinCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( SC1, _MM_SHUFFLE(0, 0, 0, 0) );
    __m128 Result = _mm_mul_ps(vConstants, x2);

    const XMVECTOR SC0 = g_XMSinCoefficients0;
    vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(3, 3, 3, 3) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( SC0, _MM_SHUFFLE(0, 0, 0, 0) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);
    Result = _mm_add_ps(Result, g_XMOne);
    Result = _mm_mul_ps(Result, x);
    *pSin = Result;

    // Compute polynomial approximation of cosine
    const XMVECTOR CC1 = g_XMCosCoefficients1;
    vConstants = XM_PERMUTE_PS( CC1, _MM_SHUFFLE(0, 0, 0, 0) );
    Result = _mm_mul_ps(vConstants, x2);

    const XMVECTOR CC0 = g_XMCosCoefficients0;
    vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(3, 3, 3, 3) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( CC0, _MM_SHUFFLE(0, 0, 0, 0) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);
    Result = _mm_add_ps(Result, g_XMOne);
    Result = _mm_mul_ps(Result, sign);
    *pCos = Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
3768
3769//------------------------------------------------------------------------------
3770
// Computes the tangent of each component of V (angles in radians) using
// the Cody-Waite argument-reduction scheme: x is reduced modulo pi/2 (the
// constant is split into two parts for extra precision), tan is evaluated
// as a rational polynomial N/D, and the reciprocal identity
// tan(x + pi/2) = -1/tan(x) handles odd quadrants.
inline XMVECTOR XMVectorTan
(
    FXMVECTOR V
)
{
    // Cody and Waite algorithm to compute tangent.

#if defined(_XM_NO_INTRINSICS_)
    // Scalar fallback: evaluate per component
    XMVECTOR Result;
    Result.vector4_f32[0] = tanf( V.vector4_f32[0] );
    Result.vector4_f32[1] = tanf( V.vector4_f32[1] );
    Result.vector4_f32[2] = tanf( V.vector4_f32[2] );
    Result.vector4_f32[3] = tanf( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    static const XMVECTORF32 TanCoefficients0 = {1.0f, -4.667168334e-1f, 2.566383229e-2f, -3.118153191e-4f};
    static const XMVECTORF32 TanCoefficients1 = {4.981943399e-7f, -1.333835001e-1f, 3.424887824e-3f, -1.786170734e-5f};
    // x = pi/2 (high part), y = pi/2 (low correction part), z = epsilon
    // for the near-zero test, w = 2/pi
    static const XMVECTORF32 TanConstants = {1.570796371f, 6.077100628e-11f, 0.000244140625f, 0.63661977228f /*2 / Pi*/ };
    static const XMVECTORU32 Mask = {0x1, 0x1, 0x1, 0x1};

    XMVECTOR TwoDivPi = XMVectorSplatW(TanConstants.v);

    XMVECTOR Zero = XMVectorZero();

    XMVECTOR C0 = XMVectorSplatX(TanConstants.v);
    XMVECTOR C1 = XMVectorSplatY(TanConstants.v);
    XMVECTOR Epsilon = XMVectorSplatZ(TanConstants.v);

    // VA = round(V * 2/pi): number of quarter turns in each angle
    XMVECTOR VA = XMVectorMultiply(V, TwoDivPi);

    VA = XMVectorRound(VA);

    // VC = V - VA * pi/2, subtracting the split constant in two steps
    // (Cody-Waite) to minimize rounding error
    XMVECTOR VC = XMVectorNegativeMultiplySubtract(VA, C0, V);

    XMVECTOR VB = XMVectorAbs(VA);

    VC = XMVectorNegativeMultiplySubtract(VA, C1, VC);

    // Convert |VA| to an integer so the quadrant parity can be tested
#if defined(_XM_ARM_NEON_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    VB = vcvtq_u32_f32( VB );
#elif defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    reinterpret_cast<__m128i *>(&VB)[0] = _mm_cvttps_epi32(VB);
#else
    for (size_t i = 0; i < 4; i++)
    {
        VB.vector4_u32[i] = (uint32_t)VB.vector4_f32[i];
    }
#endif

    XMVECTOR VC2 = XMVectorMultiply(VC, VC);

    XMVECTOR T7 = XMVectorSplatW(TanCoefficients1.v);
    XMVECTOR T6 = XMVectorSplatZ(TanCoefficients1.v);
    XMVECTOR T4 = XMVectorSplatX(TanCoefficients1.v);
    XMVECTOR T3 = XMVectorSplatW(TanCoefficients0.v);
    XMVECTOR T5 = XMVectorSplatY(TanCoefficients1.v);
    XMVECTOR T2 = XMVectorSplatZ(TanCoefficients0.v);
    XMVECTOR T1 = XMVectorSplatY(TanCoefficients0.v);
    XMVECTOR T0 = XMVectorSplatX(TanCoefficients0.v);

    // Even quadrant count -> tan(VC); odd -> -1/tan(VC)
    XMVECTOR VBIsEven = XMVectorAndInt(VB, Mask.v);
    VBIsEven = XMVectorEqualInt(VBIsEven, Zero);

    // Evaluate numerator N and denominator D of the rational polynomial
    XMVECTOR N = XMVectorMultiplyAdd(VC2, T7, T6);
    XMVECTOR D = XMVectorMultiplyAdd(VC2, T4, T3);
    N = XMVectorMultiplyAdd(VC2, N, T5);
    D = XMVectorMultiplyAdd(VC2, D, T2);
    N = XMVectorMultiply(VC2, N);
    D = XMVectorMultiplyAdd(VC2, D, T1);
    N = XMVectorMultiplyAdd(VC, N, VC);
    XMVECTOR VCNearZero = XMVectorInBounds(VC, Epsilon);
    D = XMVectorMultiplyAdd(VC2, D, T0);

    // For very small reduced arguments use N = VC, D = 1 directly
    N = XMVectorSelect(N, VC, VCNearZero);
    D = XMVectorSelect(D, g_XMOne.v, VCNearZero);

    // R1 = N/D for even quadrants, R0 = D/(-N) = -1/(N/D) for odd ones
    XMVECTOR R0 = XMVectorNegate(N);
    XMVECTOR R1 = XMVectorDivide(N,D);
    R0 = XMVectorDivide(D,R0);

    XMVECTOR VIsZero = XMVectorEqual(V, Zero);

    XMVECTOR Result = XMVectorSelect(R0, R1, VBIsEven);

    // Force tan(0) to be exactly 0
    Result = XMVectorSelect(Result, Zero, VIsZero);

    return Result;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
3863
3864//------------------------------------------------------------------------------
3865
// Computes the per-component hyperbolic sine of V.
// Uses the identity sinh(x) = (e^x - e^-x) / 2. The arguments fed to
// XMVectorExp are pre-scaled by 1/ln(2) and biased by -1; assuming
// XMVectorExp is a base-2 exponential (NOTE: its definition is outside this
// view -- confirm), that yields e^x/2 and e^-x/2 directly, so the final
// subtraction needs no extra halving.
inline XMVECTOR XMVectorSinH
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    // Scalar reference path: one CRT sinhf() call per lane.
    XMVECTOR Result;
    Result.vector4_f32[0] = sinhf( V.vector4_f32[0] );
    Result.vector4_f32[1] = sinhf( V.vector4_f32[1] );
    Result.vector4_f32[2] = sinhf( V.vector4_f32[2] );
    Result.vector4_f32[3] = sinhf( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)

    // V1 = V/ln(2) - 1, V2 = -V/ln(2) - 1 (fused multiply-add/subtract).
    XMVECTOR V1 = vmlaq_f32( g_XMNegativeOne.v, V, Scale.v );
    XMVECTOR V2 = vmlsq_f32( g_XMNegativeOne.v, V, Scale.v );
    XMVECTOR E1 = XMVectorExp(V1);
    XMVECTOR E2 = XMVectorExp(V2);

    return vsubq_f32(E1, E2);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)

    // V1 = V/ln(2) - 1, V2 = -V/ln(2) - 1.
    XMVECTOR V1 = _mm_mul_ps(V, Scale);
    V1 = _mm_add_ps(V1,g_XMNegativeOne);
    XMVECTOR V2 = _mm_mul_ps(V, Scale);
    V2 = _mm_sub_ps(g_XMNegativeOne,V2);
    XMVECTOR E1 = XMVectorExp(V1);
    XMVECTOR E2 = XMVectorExp(V2);

    return _mm_sub_ps(E1, E2);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
3901
3902//------------------------------------------------------------------------------
3903
// Computes the per-component hyperbolic cosine of V.
// Uses the identity cosh(x) = (e^x + e^-x) / 2; the same 1/ln(2) scale and
// -1 bias trick as XMVectorSinH folds the divide-by-two into the exponent
// (assumes XMVectorExp is base-2 -- definition not in this view, confirm).
inline XMVECTOR XMVectorCosH
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    // Scalar reference path: one CRT coshf() call per lane.
    XMVECTOR Result;
    Result.vector4_f32[0] = coshf( V.vector4_f32[0] );
    Result.vector4_f32[1] = coshf( V.vector4_f32[1] );
    Result.vector4_f32[2] = coshf( V.vector4_f32[2] );
    Result.vector4_f32[3] = coshf( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)

    // V1 = V/ln(2) - 1, V2 = -V/ln(2) - 1.
    XMVECTOR V1 = vmlaq_f32(g_XMNegativeOne.v, V, Scale.v);
    XMVECTOR V2 = vmlsq_f32(g_XMNegativeOne.v, V, Scale.v);
    XMVECTOR E1 = XMVectorExp(V1);
    XMVECTOR E2 = XMVectorExp(V2);
    return vaddq_f32(E1, E2);
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Scale = {1.442695040888963f, 1.442695040888963f, 1.442695040888963f, 1.442695040888963f}; // 1.0f / ln(2.0f)

    // V1 = V/ln(2) - 1, V2 = -V/ln(2) - 1.
    XMVECTOR V1 = _mm_mul_ps(V,Scale.v);
    V1 = _mm_add_ps(V1,g_XMNegativeOne.v);
    XMVECTOR V2 = _mm_mul_ps(V, Scale.v);
    V2 = _mm_sub_ps(g_XMNegativeOne.v,V2);
    XMVECTOR E1 = XMVectorExp(V1);
    XMVECTOR E2 = XMVectorExp(V2);
    return _mm_add_ps(E1, E2);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
3937
3938//------------------------------------------------------------------------------
3939
// Computes the per-component hyperbolic tangent of V.
// Uses the identity tanh(x) = 1 - 2/(e^(2x) + 1):
//   E = exp(2x)  (via a 2/ln(2) pre-scale, assuming base-2 XMVectorExp),
//   E = 0.5*E + 0.5, then 1 - 1/E  ==  1 - 2/(e^(2x) + 1).
inline XMVECTOR XMVectorTanH
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    // Scalar reference path: one CRT tanhf() call per lane.
    XMVECTOR Result;
    Result.vector4_f32[0] = tanhf( V.vector4_f32[0] );
    Result.vector4_f32[1] = tanhf( V.vector4_f32[1] );
    Result.vector4_f32[2] = tanhf( V.vector4_f32[2] );
    Result.vector4_f32[3] = tanhf( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f)

    XMVECTOR E = vmulq_f32(V, Scale.v);
    E = XMVectorExp(E);                                // e^(2x)
    E = vmlaq_f32( g_XMOneHalf.v, E, g_XMOneHalf.v );  // 0.5*e^(2x) + 0.5
    E = XMVectorReciprocal(E);
    return vsubq_f32(g_XMOne.v, E);                    // 1 - 2/(e^(2x)+1)
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Scale = {2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f, 2.8853900817779268f}; // 2.0f / ln(2.0f)

    XMVECTOR E = _mm_mul_ps(V, Scale.v);
    E = XMVectorExp(E);               // e^(2x)
    E = _mm_mul_ps(E,g_XMOneHalf.v);
    E = _mm_add_ps(E,g_XMOneHalf.v);  // 0.5*e^(2x) + 0.5
    E = _mm_div_ps(g_XMOne.v,E);
    return _mm_sub_ps(g_XMOne.v,E);   // 1 - 2/(e^(2x)+1)
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
3972
3973//------------------------------------------------------------------------------
3974
// Computes the per-component arcsine of V (input expected in [-1, 1],
// result in radians). The intrinsic paths evaluate a 7-degree minimax
// polynomial for acos(|V|) scaled by sqrt(1-|V|), mirror it across Pi for
// negative inputs, then apply asin(x) = Pi/2 - acos(x).
inline XMVECTOR XMVectorASin
(
    FXMVECTOR V
)
{
    // 7-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar reference path.
    XMVECTOR Result;
    Result.vector4_f32[0] = XMScalarASin( V.vector4_f32[0] );
    Result.vector4_f32[1] = XMScalarASin( V.vector4_f32[1] );
    Result.vector4_f32[2] = XMScalarASin( V.vector4_f32[2] );
    Result.vector4_f32[3] = XMScalarASin( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 nonnegative = vcgeq_f32(V, g_XMZero);  // lane mask: V >= 0
    __n128 x = vabsq_f32(V);

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __n128 oneMValue = vsubq_f32(g_XMOne, x);
    __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
    __n128 root = XMVectorSqrt(clampOneMValue);

    // Compute polynomial approximation: Horner evaluation in |V|, pulling
    // one coefficient at a time from the packed constant vectors
    // (AC1.w down to AC0.x).
    const XMVECTOR AC1 = g_XMArcCoefficients1;
    __n128 t0 = vdupq_lane_f32(vget_high_f32(AC1), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0);
    t0 = vmlaq_f32( vConstants, t0, x );

    const XMVECTOR AC0 = g_XMArcCoefficients0;
    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0);
    t0 = vmlaq_f32( vConstants, t0, x );
    t0 = vmulq_f32(t0, root);  // t0 ~= acos(|V|)

    // Mirror for negative inputs, then asin(x) = Pi/2 - acos(x).
    __n128 t1 = vsubq_f32(g_XMPi, t0);
    t0 = vbslq_f32( nonnegative, t0, t1 );
    t0 = vsubq_f32(g_XMHalfPi, t0);
    return t0;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);  // lane mask: V >= 0
    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
    __m128 x = _mm_max_ps(V, mvalue); // |V|

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)

    // Compute polynomial approximation: Horner evaluation in |V|, splatting
    // one packed coefficient per step (AC1.w down to AC0.x).
    const XMVECTOR AC1 = g_XMArcCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 t0 = _mm_mul_ps(vConstants, x);

    vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    const XMVECTOR AC0 = g_XMArcCoefficients0;
    vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AC0,_MM_SHUFFLE(2, 2, 2, 2) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, root);  // t0 ~= acos(|V|)

    // Mirror for negative inputs (bitwise select via and/andnot/or),
    // then asin(x) = Pi/2 - acos(x).
    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
    t0 = _mm_and_ps(nonnegative, t0);
    t1 = _mm_andnot_ps(nonnegative, t1);
    t0 = _mm_or_ps(t0, t1);
    t0 = _mm_sub_ps(g_XMHalfPi, t0);
    return t0;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
4082
4083//------------------------------------------------------------------------------
4084
// Computes the per-component arccosine of V (input expected in [-1, 1],
// result in radians). Same 7-degree minimax construction as XMVectorASin --
// polynomial in |V| scaled by sqrt(1-|V|), mirrored across Pi for negative
// inputs -- but without the final Pi/2 subtraction.
inline XMVECTOR XMVectorACos
(
    FXMVECTOR V
)
{
    // 7-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar reference path.
    XMVECTOR Result;
    Result.vector4_f32[0] = XMScalarACos( V.vector4_f32[0] );
    Result.vector4_f32[1] = XMScalarACos( V.vector4_f32[1] );
    Result.vector4_f32[2] = XMScalarACos( V.vector4_f32[2] );
    Result.vector4_f32[3] = XMScalarACos( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 nonnegative = vcgeq_f32(V, g_XMZero);  // lane mask: V >= 0
    __n128 x = vabsq_f32(V);

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __n128 oneMValue = vsubq_f32(g_XMOne, x);
    __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
    __n128 root = XMVectorSqrt(clampOneMValue);

    // Compute polynomial approximation: Horner evaluation in |V|,
    // coefficients splatted from AC1.w down to AC0.x.
    const XMVECTOR AC1 = g_XMArcCoefficients1;
    __n128 t0 = vdupq_lane_f32(vget_high_f32(AC1), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AC1), 0);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 1);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AC1), 0);
    t0 = vmlaq_f32( vConstants, t0, x );

    const XMVECTOR AC0 = g_XMArcCoefficients0;
    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 1);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_high_f32(AC0), 0);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 1);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AC0), 0);
    t0 = vmlaq_f32( vConstants, t0, x );
    t0 = vmulq_f32(t0, root);  // t0 ~= acos(|V|)

    // Mirror across Pi for negative inputs: acos(-x) = Pi - acos(x).
    __n128 t1 = vsubq_f32(g_XMPi, t0);
    t0 = vbslq_f32( nonnegative, t0, t1 );
    return t0;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);  // lane mask: V >= 0
    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
    __m128 x = _mm_max_ps(V, mvalue); // |V|

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)

    // Compute polynomial approximation: Horner evaluation in |V|,
    // coefficients splatted from AC1.w down to AC0.x.
    const XMVECTOR AC1 = g_XMArcCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 t0 = _mm_mul_ps(vConstants, x);

    vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(2, 2, 2, 2) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(1, 1, 1, 1) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AC1, _MM_SHUFFLE(0, 0, 0, 0) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    const XMVECTOR AC0 = g_XMArcCoefficients0;
    vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(3, 3, 3, 3) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(2, 2, 2, 2) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(1, 1, 1, 1) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AC0, _MM_SHUFFLE(0, 0, 0, 0) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, root);  // t0 ~= acos(|V|)

    // Mirror across Pi for negative inputs: acos(-x) = Pi - acos(x).
    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
    t0 = _mm_and_ps(nonnegative, t0);
    t1 = _mm_andnot_ps(nonnegative, t1);
    t0 = _mm_or_ps(t0, t1);
    return t0;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
4190
4191//------------------------------------------------------------------------------
4192
// Computes the per-component arctangent of V (result in [-Pi/2, Pi/2]).
// Range reduction: for |V| <= 1 the polynomial is evaluated on V directly
// (sign lane set to 0); for |V| > 1 it is evaluated on 1/V and the result
// folded back as sign*Pi/2 - atan(1/V), where sign is +/-1 from V.
inline XMVECTOR XMVectorATan
(
    FXMVECTOR V
)
{
    // 17-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar reference path: one CRT atanf() call per lane.
    XMVECTOR Result;
    Result.vector4_f32[0] = atanf( V.vector4_f32[0] );
    Result.vector4_f32[1] = atanf( V.vector4_f32[1] );
    Result.vector4_f32[2] = atanf( V.vector4_f32[2] );
    Result.vector4_f32[3] = atanf( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 absV = vabsq_f32(V);
    __n128 invV = XMVectorReciprocal(V);
    // sign = +1 where V > 1, -1 where V < -1, then forced to 0 where |V| <= 1.
    __n128 comp = vcgtq_f32(V, g_XMOne);
    __n128 sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne);
    comp = vcleq_f32(absV, g_XMOne);
    sign = vbslq_f32(comp, g_XMZero, sign);
    // x = V inside [-1,1], 1/V outside.
    __n128 x = vbslq_f32(comp, V, invV);

    __n128 x2 = vmulq_f32(x, x);

    // Compute polynomial approximation: odd polynomial in x, evaluated as a
    // Horner chain in x^2 with coefficients splatted TC1.w down to TC0.x.
    const XMVECTOR TC1 = g_XMATanCoefficients1;
    __n128 Result = vdupq_lane_f32(vget_high_f32(TC1), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(TC1), 0);
    Result = vmlaq_f32( vConstants, Result, x2 );

    vConstants = vdupq_lane_f32(vget_low_f32(TC1), 1);
    Result = vmlaq_f32( vConstants, Result, x2 );

    vConstants = vdupq_lane_f32(vget_low_f32(TC1), 0);
    Result = vmlaq_f32( vConstants, Result, x2 );

    const XMVECTOR TC0 = g_XMATanCoefficients0;
    vConstants = vdupq_lane_f32(vget_high_f32(TC0), 1);
    Result = vmlaq_f32( vConstants, Result, x2 );

    vConstants = vdupq_lane_f32(vget_high_f32(TC0), 0);
    Result = vmlaq_f32( vConstants, Result, x2 );

    vConstants = vdupq_lane_f32(vget_low_f32(TC0), 1);
    Result = vmlaq_f32( vConstants, Result, x2 );

    vConstants = vdupq_lane_f32(vget_low_f32(TC0), 0);
    Result = vmlaq_f32( vConstants, Result, x2 );

    Result = vmlaq_f32( g_XMOne, Result, x2 );
    Result = vmulq_f32( Result, x );  // Result ~= atan(x)

    // Fold the reduced range back: sign*Pi/2 - atan(1/V) where |V| > 1.
    __n128 result1 = vmulq_f32(sign, g_XMHalfPi);
    result1 = vsubq_f32(result1, Result);

    comp = vceqq_f32(sign, g_XMZero);  // sign == 0 means |V| <= 1
    Result = vbslq_f32( comp, Result, result1 );
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 absV = XMVectorAbs(V);
    __m128 invV = _mm_div_ps(g_XMOne, V);
    // sign = +1 where V > 1, -1 where V < -1 (bitwise select pattern)...
    __m128 comp = _mm_cmpgt_ps(V, g_XMOne);
    __m128 select0 = _mm_and_ps(comp, g_XMOne);
    __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
    __m128 sign = _mm_or_ps(select0, select1);
    // ...then forced to 0 where |V| <= 1.
    comp = _mm_cmple_ps(absV, g_XMOne);
    select0 = _mm_and_ps(comp, g_XMZero);
    select1 = _mm_andnot_ps(comp, sign);
    sign = _mm_or_ps(select0, select1);
    // x = V inside [-1,1], 1/V outside.
    select0 = _mm_and_ps(comp, V);
    select1 = _mm_andnot_ps(comp, invV);
    __m128 x = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation: odd polynomial in x, evaluated as a
    // Horner chain in x^2 with coefficients splatted TC1.w down to TC0.x.
    const XMVECTOR TC1 = g_XMATanCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 Result = _mm_mul_ps(vConstants, x2);

    vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( TC1, _MM_SHUFFLE(0, 0, 0, 0) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    const XMVECTOR TC0 = g_XMATanCoefficients0;
    vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(3, 3, 3, 3) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( TC0, _MM_SHUFFLE(0, 0, 0, 0) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);
    Result = _mm_add_ps(Result, g_XMOne);
    Result = _mm_mul_ps(Result, x);  // Result ~= atan(x)
    // Fold the reduced range back: sign*Pi/2 - atan(1/V) where |V| > 1.
    __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi);
    result1 = _mm_sub_ps(result1, Result);

    comp = _mm_cmpeq_ps(sign, g_XMZero);  // sign == 0 means |V| <= 1
    select0 = _mm_and_ps(comp, Result);
    select1 = _mm_andnot_ps(comp, result1);
    Result = _mm_or_ps(select0, select1);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
4316
4317//------------------------------------------------------------------------------
4318
4319inline XMVECTOR XMVectorATan2
4320(
4321 FXMVECTOR Y,
4322 FXMVECTOR X
4323)
4324{
4325 // Return the inverse tangent of Y / X in the range of -Pi to Pi with the following exceptions:
4326
4327 // Y == 0 and X is Negative -> Pi with the sign of Y
4328 // y == 0 and x is positive -> 0 with the sign of y
4329 // Y != 0 and X == 0 -> Pi / 2 with the sign of Y
4330 // Y != 0 and X is Negative -> atan(y/x) + (PI with the sign of Y)
4331 // X == -Infinity and Finite Y -> Pi with the sign of Y
4332 // X == +Infinity and Finite Y -> 0 with the sign of Y
4333 // Y == Infinity and X is Finite -> Pi / 2 with the sign of Y
4334 // Y == Infinity and X == -Infinity -> 3Pi / 4 with the sign of Y
4335 // Y == Infinity and X == +Infinity -> Pi / 4 with the sign of Y
4336
4337 static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, XM_PI * 3.0f / 4.0f};
4338
4339 XMVECTOR Zero = XMVectorZero();
4340 XMVECTOR ATanResultValid = XMVectorTrueInt();
4341
4342 XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
4343 XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
4344 XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
4345 XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);
4346
4347 XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
4348 XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
4349 XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v);
4350 XIsPositive = XMVectorEqualInt(XIsPositive, Zero);
4351 XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
4352 XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);
4353
4354 XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
4355 Pi = XMVectorOrInt(Pi, YSign);
4356 PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
4357 PiOverFour = XMVectorOrInt(PiOverFour, YSign);
4358 ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);
4359
4360 XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
4361 XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
4362 XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
4363 XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
4364 XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
4365 XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
4366 ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);
4367
4368 XMVECTOR V = XMVectorDivide(Y, X);
4369
4370 XMVECTOR R0 = XMVectorATan(V);
4371
4372 R1 = XMVectorSelect( Pi, Zero, XIsPositive );
4373 R2 = XMVectorAdd(R0, R1);
4374
4375 return XMVectorSelect(Result, R2, ATanResultValid);
4376}
4377
4378//------------------------------------------------------------------------------
4379
// Fast per-component sine estimate (7-degree minimax polynomial; lower
// precision than XMVectorSin). The angle is wrapped to [-Pi, Pi], reflected
// into [-Pi/2, Pi/2] using sin(Pi - x) = sin(x), then the odd polynomial is
// evaluated as a Horner chain in x^2.
inline XMVECTOR XMVectorSinEst
(
    FXMVECTOR V
)
{
    // 7-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar reference path.
    XMVECTOR Result;
    Result.vector4_f32[0] = XMScalarSinEst( V.vector4_f32[0] );
    Result.vector4_f32[1] = XMScalarSinEst( V.vector4_f32[1] );
    Result.vector4_f32[2] = XMScalarSinEst( V.vector4_f32[2] );
    Result.vector4_f32[3] = XMScalarSinEst( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with sin(y) = sin(x).
    __n128 sign = vandq_u32(x, g_XMNegativeZero);
    __n128 c = vorrq_u32(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
    __n128 absx = vabsq_f32( x );
    __n128 rflx = vsubq_f32(c, x);       // reflected angle (+/-pi - x)
    __n128 comp = vcleq_f32(absx, g_XMHalfPi);
    x = vbslq_f32( comp, x, rflx );

    __n128 x2 = vmulq_f32(x, x);

    // Compute polynomial approximation (Horner in x^2, coefficients
    // splatted from SEC.w down to SEC.y, with 1.0 as the final term).
    const XMVECTOR SEC = g_XMSinCoefficients1;
    XMVECTOR Result = vdupq_lane_f32(vget_high_f32(SEC), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    Result = vmulq_f32(Result, x);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with sin(y) = sin(x).
    __m128 sign = _mm_and_ps(x, g_XMNegativeZero);
    __m128 c = _mm_or_ps(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
    __m128 absx = _mm_andnot_ps(sign, x);  // |x|
    __m128 rflx = _mm_sub_ps(c, x);        // reflected angle (+/-pi - x)
    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
    __m128 select0 = _mm_and_ps(comp, x);
    __m128 select1 = _mm_andnot_ps(comp, rflx);
    x = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation (Horner in x^2, coefficients
    // splatted from SEC.w down to SEC.y, with 1.0 as the final term).
    const XMVECTOR SEC = g_XMSinCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 Result = _mm_mul_ps(vConstants, x2);

    vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    Result = _mm_add_ps(Result, g_XMOne);
    Result = _mm_mul_ps(Result, x);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
4456
4457//------------------------------------------------------------------------------
4458
// Fast per-component cosine estimate (6-degree minimax polynomial; lower
// precision than XMVectorCos). The angle is wrapped to [-Pi, Pi], reflected
// into [-Pi/2, Pi/2] using cos(Pi - x) = -cos(x) (the sign is carried in a
// separate +/-1 vector), then an even polynomial is evaluated in x^2.
inline XMVECTOR XMVectorCosEst
(
    FXMVECTOR V
)
{
    // 6-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar reference path.
    XMVECTOR Result;
    Result.vector4_f32[0] = XMScalarCosEst( V.vector4_f32[0] );
    Result.vector4_f32[1] = XMScalarCosEst( V.vector4_f32[1] );
    Result.vector4_f32[2] = XMScalarCosEst( V.vector4_f32[2] );
    Result.vector4_f32[3] = XMScalarCosEst( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Map V to x in [-pi,pi].
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
    __n128 sign = vandq_u32(x, g_XMNegativeZero);
    __n128 c = vorrq_u32(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
    __n128 absx = vabsq_f32( x );
    __n128 rflx = vsubq_f32(c, x);       // reflected angle (+/-pi - x)
    __n128 comp = vcleq_f32(absx, g_XMHalfPi);
    x = vbslq_f32( comp, x, rflx );
    sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne );  // -1 where reflected

    __n128 x2 = vmulq_f32(x, x);

    // Compute polynomial approximation (Horner in x^2, coefficients
    // splatted from CEC.w down to CEC.y, with 1.0 as the final term).
    const XMVECTOR CEC = g_XMCosCoefficients1;
    XMVECTOR Result = vdupq_lane_f32(vget_high_f32(CEC), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    Result = vmulq_f32(Result, sign);
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    // Map V to x in [-pi,pi].
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
    XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
    __m128 c = _mm_or_ps(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
    __m128 absx = _mm_andnot_ps(sign, x);  // |x|
    __m128 rflx = _mm_sub_ps(c, x);        // reflected angle (+/-pi - x)
    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
    __m128 select0 = _mm_and_ps(comp, x);
    __m128 select1 = _mm_andnot_ps(comp, rflx);
    x = _mm_or_ps(select0, select1);
    select0 = _mm_and_ps(comp, g_XMOne);
    select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
    sign = _mm_or_ps(select0, select1);    // -1 where reflected

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation (Horner in x^2, coefficients
    // splatted from CEC.w down to CEC.y, with 1.0 as the final term).
    const XMVECTOR CEC = g_XMCosCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 Result = _mm_mul_ps(vConstants, x2);

    vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    Result = _mm_add_ps(Result, g_XMOne);
    Result = _mm_mul_ps(Result, sign);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
4539
4540//------------------------------------------------------------------------------
4541
_Use_decl_annotations_
// Computes fast per-component sine and cosine estimates of V in one pass,
// sharing the angle wrap and range reduction between the two polynomials
// (7-degree for sine, 6-degree for cosine; lower precision than
// XMVectorSinCos). Results are written through pSin and pCos, which must
// both be non-null.
inline void XMVectorSinCosEst
(
    XMVECTOR* pSin, 
    XMVECTOR* pCos, 
    FXMVECTOR  V
)
{
    assert(pSin != NULL);
    assert(pCos != NULL);

    // 7/6-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    // Scalar reference path.
    XMVECTOR Sin;
    XMVECTOR Cos;

    XMScalarSinCosEst(&Sin.vector4_f32[0], &Cos.vector4_f32[0], V.vector4_f32[0]);
    XMScalarSinCosEst(&Sin.vector4_f32[1], &Cos.vector4_f32[1], V.vector4_f32[1]);
    XMScalarSinCosEst(&Sin.vector4_f32[2], &Cos.vector4_f32[2], V.vector4_f32[2]);
    XMScalarSinCosEst(&Sin.vector4_f32[3], &Cos.vector4_f32[3], V.vector4_f32[3]);

    *pSin = Sin;
    *pCos = Cos;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with cos(y) = sign*cos(x).
    __n128 sign = vandq_u32(x, g_XMNegativeZero);
    __n128 c = vorrq_u32(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
    __n128 absx = vabsq_f32( x );
    __n128 rflx = vsubq_f32(c, x);       // reflected angle (+/-pi - x)
    __n128 comp = vcleq_f32(absx, g_XMHalfPi);
    x = vbslq_f32( comp, x, rflx );
    sign = vbslq_f32( comp, g_XMOne, g_XMNegativeOne );  // -1 where reflected

    // x^2 feeds both Horner chains below.
    __n128 x2 = vmulq_f32(x, x);

    // Compute polynomial approximation for sine
    const XMVECTOR SEC = g_XMSinCoefficients1;
    XMVECTOR Result = vdupq_lane_f32(vget_high_f32(SEC), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(SEC), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(SEC), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    *pSin = vmulq_f32(Result, x);

    // Compute polynomial approximation for cosine
    const XMVECTOR CEC = g_XMCosCoefficients1;
    Result = vdupq_lane_f32(vget_high_f32(CEC), 1);

    vConstants = vdupq_lane_f32(vget_high_f32(CEC), 0);
    Result = vmlaq_f32(vConstants, Result, x2);

    vConstants = vdupq_lane_f32(vget_low_f32(CEC), 1);
    Result = vmlaq_f32(vConstants, Result, x2);

    Result = vmlaq_f32(g_XMOne, Result, x2);
    *pCos = vmulq_f32(Result, sign);
#elif defined(_XM_SSE_INTRINSICS_)
    // Force the value within the bounds of pi
    XMVECTOR x = XMVectorModAngles(V);

    // Map in [-pi/2,pi/2] with sin(y) = sin(x), cos(y) = sign*cos(x).
    XMVECTOR sign = _mm_and_ps(x, g_XMNegativeZero);
    __m128 c = _mm_or_ps(g_XMPi, sign);  // pi when x >= 0, -pi when x < 0
    __m128 absx = _mm_andnot_ps(sign, x);  // |x|
    __m128 rflx = _mm_sub_ps(c, x);        // reflected angle (+/-pi - x)
    __m128 comp = _mm_cmple_ps(absx, g_XMHalfPi);
    __m128 select0 = _mm_and_ps(comp, x);
    __m128 select1 = _mm_andnot_ps(comp, rflx);
    x = _mm_or_ps(select0, select1);
    select0 = _mm_and_ps(comp, g_XMOne);
    select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
    sign = _mm_or_ps(select0, select1);    // -1 where reflected

    // x^2 feeds both Horner chains below.
    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation for sine
    const XMVECTOR SEC = g_XMSinCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 Result = _mm_mul_ps(vConstants, x2);

    vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( SEC, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    Result = _mm_add_ps(Result, g_XMOne);
    Result = _mm_mul_ps(Result, x);
    *pSin = Result;

    // Compute polynomial approximation for cosine
    const XMVECTOR CEC = g_XMCosCoefficients1;
    vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(3, 3, 3, 3) );
    Result = _mm_mul_ps(vConstants, x2);

    vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( CEC, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    Result = _mm_add_ps(Result, g_XMOne);
    Result = _mm_mul_ps(Result, sign);
    *pCos = Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
4661
4662//------------------------------------------------------------------------------
4663
4664inline XMVECTOR XMVectorTanEst
4665(
4666 FXMVECTOR V
4667)
4668{
4669 XMVECTOR OneOverPi = XMVectorSplatW(g_XMTanEstCoefficients.v);
4670
4671 XMVECTOR V1 = XMVectorMultiply(V, OneOverPi);
4672 V1 = XMVectorRound(V1);
4673
4674 V1 = XMVectorNegativeMultiplySubtract(g_XMPi.v, V1, V);
4675
4676 XMVECTOR T0 = XMVectorSplatX(g_XMTanEstCoefficients.v);
4677 XMVECTOR T1 = XMVectorSplatY(g_XMTanEstCoefficients.v);
4678 XMVECTOR T2 = XMVectorSplatZ(g_XMTanEstCoefficients.v);
4679
4680 XMVECTOR V2T2 = XMVectorNegativeMultiplySubtract(V1, V1, T2);
4681 XMVECTOR V2 = XMVectorMultiply(V1, V1);
4682 XMVECTOR V1T0 = XMVectorMultiply(V1, T0);
4683 XMVECTOR V1T1 = XMVectorMultiply(V1, T1);
4684
4685 XMVECTOR D = XMVectorReciprocalEst(V2T2);
4686 XMVECTOR N = XMVectorMultiplyAdd(V2, V1T1, V1T0);
4687
4688 return XMVectorMultiply(N, D);
4689}
4690
4691
4692//------------------------------------------------------------------------------
4693
inline XMVECTOR XMVectorASinEst
(
    FXMVECTOR V
)
{
    // Estimated per-component arcsine of V; inputs are expected in [-1,1].
    // All paths use asin(x) = pi/2 - acos(x), where acos(|x|) is
    // approximated by poly3(|x|) * sqrt(1-|x|) and the identity
    // acos(x) = pi - acos(-x) covers negative inputs.

    // 3-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result.vector4_f32[0] = XMScalarASinEst( V.vector4_f32[0] );
    Result.vector4_f32[1] = XMScalarASinEst( V.vector4_f32[1] );
    Result.vector4_f32[2] = XMScalarASinEst( V.vector4_f32[2] );
    Result.vector4_f32[3] = XMScalarASinEst( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 nonnegative = vcgeq_f32(V, g_XMZero);
    __n128 x = vabsq_f32(V);

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __n128 oneMValue = vsubq_f32(g_XMOne, x);
    __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
    __n128 root = XMVectorSqrt(clampOneMValue);

    // Compute polynomial approximation (Horner's method, coefficients
    // taken from lane w down to lane x of g_XMArcEstCoefficients).
    const XMVECTOR AEC = g_XMArcEstCoefficients;
    __n128 t0 = vdupq_lane_f32(vget_high_f32(AEC), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0);
    t0 = vmlaq_f32( vConstants, t0, x );
    t0 = vmulq_f32(t0, root);   // t0 ~= acos(|V|)

    // Negative lanes: acos(V) = pi - acos(|V|).
    __n128 t1 = vsubq_f32(g_XMPi, t0);
    t0 = vbslq_f32( nonnegative, t0, t1 );
    // asin(V) = pi/2 - acos(V)
    t0 = vsubq_f32(g_XMHalfPi, t0);
    return t0;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
    __m128 x = _mm_max_ps(V, mvalue);  // |V|

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)

    // Compute polynomial approximation (Horner's method, coefficients
    // splatted from lane w down to lane x).
    const XMVECTOR AEC = g_XMArcEstCoefficients;
    XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 t0 = _mm_mul_ps(vConstants, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, root);  // t0 ~= acos(|V|)

    // Select acos(|V|) for nonnegative lanes, pi - acos(|V|) otherwise,
    // then convert acos to asin via pi/2 - acos.
    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
    t0 = _mm_and_ps(nonnegative, t0);
    t1 = _mm_andnot_ps(nonnegative, t1);
    t0 = _mm_or_ps(t0, t1);
    t0 = _mm_sub_ps(g_XMHalfPi, t0);
    return t0;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
4771
4772//------------------------------------------------------------------------------
4773
inline XMVECTOR XMVectorACosEst
(
    FXMVECTOR V
)
{
    // Estimated per-component arccosine of V; inputs are expected in [-1,1].
    // acos(|x|) is approximated by poly3(|x|) * sqrt(1-|x|); negative
    // inputs use the identity acos(x) = pi - acos(-x). Shares the
    // g_XMArcEstCoefficients table with XMVectorASinEst.

    // 3-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result.vector4_f32[0] = XMScalarACosEst( V.vector4_f32[0] );
    Result.vector4_f32[1] = XMScalarACosEst( V.vector4_f32[1] );
    Result.vector4_f32[2] = XMScalarACosEst( V.vector4_f32[2] );
    Result.vector4_f32[3] = XMScalarACosEst( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 nonnegative = vcgeq_f32(V, g_XMZero);
    __n128 x = vabsq_f32(V);

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __n128 oneMValue = vsubq_f32(g_XMOne, x);
    __n128 clampOneMValue = vmaxq_f32(g_XMZero, oneMValue);
    __n128 root = XMVectorSqrt(clampOneMValue);

    // Compute polynomial approximation (Horner's method, coefficients
    // taken from lane w down to lane x of g_XMArcEstCoefficients).
    const XMVECTOR AEC = g_XMArcEstCoefficients;
    __n128 t0 = vdupq_lane_f32(vget_high_f32(AEC), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
    t0 = vmlaq_f32( vConstants, t0, x );

    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 0);
    t0 = vmlaq_f32( vConstants, t0, x );
    t0 = vmulq_f32(t0, root);   // t0 ~= acos(|V|)

    // Negative lanes: acos(V) = pi - acos(|V|).
    __n128 t1 = vsubq_f32(g_XMPi, t0);
    t0 = vbslq_f32( nonnegative, t0, t1 );
    return t0;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 nonnegative = _mm_cmpge_ps(V, g_XMZero);
    __m128 mvalue = _mm_sub_ps(g_XMZero, V);
    __m128 x = _mm_max_ps(V, mvalue);  // |V|

    // Compute (1-|V|), clamp to zero to avoid sqrt of negative number.
    __m128 oneMValue = _mm_sub_ps(g_XMOne, x);
    __m128 clampOneMValue = _mm_max_ps(g_XMZero, oneMValue);
    __m128 root = _mm_sqrt_ps(clampOneMValue);  // sqrt(1-|V|)

    // Compute polynomial approximation (Horner's method, coefficients
    // splatted from lane w down to lane x).
    const XMVECTOR AEC = g_XMArcEstCoefficients;
    XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 t0 = _mm_mul_ps(vConstants, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, x);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
    t0 = _mm_add_ps(t0, vConstants);
    t0 = _mm_mul_ps(t0, root);  // t0 ~= acos(|V|)

    // Select acos(|V|) for nonnegative lanes, pi - acos(|V|) otherwise.
    __m128 t1 = _mm_sub_ps(g_XMPi, t0);
    t0 = _mm_and_ps(nonnegative, t0);
    t1 = _mm_andnot_ps(nonnegative, t1);
    t0 = _mm_or_ps(t0, t1);
    return t0;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
4849
4850//------------------------------------------------------------------------------
4851
4852namespace Internal
4853{
4854
4855inline float XMScalarATanEst
4856(
4857 float Value
4858)
4859{
4860 float y, sign;
4861 if (fabsf(Value) <= 1.0f)
4862 {
4863 y = Value;
4864 sign = 0.0f;
4865 }
4866 else if (Value > 1.0f)
4867 {
4868 y = 1.0f / Value;
4869 sign = 1.0f;
4870 }
4871 else
4872 {
4873 y = 1.0f / Value;
4874 sign = -1.0f;
4875 }
4876
4877 // 9-degree minimax approximation
4878 float y2 = y*y;
4879 float poly = ((((0.0208351f*y2-0.085133f)*y2+0.180141f)*y2-0.3302995f)*y2+0.999866f)*y;
4880
4881 return (sign == 0.0f ? poly : sign*XM_PIDIV2 - poly);
4882}
4883
4884}; // namespace Internal
4885
4886//------------------------------------------------------------------------------
4887
inline XMVECTOR XMVectorATanEst
(
    FXMVECTOR V
)
{
    // Estimated per-component arctangent of V over the full real range.
    // For |V| <= 1 a 9-degree odd minimax polynomial is evaluated
    // directly; otherwise the argument is folded with the identity
    // atan(v) = sign(v)*pi/2 - atan(1/v).

    // 9-degree minimax approximation

#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR Result;
    Result.vector4_f32[0] = Internal::XMScalarATanEst( V.vector4_f32[0] );
    Result.vector4_f32[1] = Internal::XMScalarATanEst( V.vector4_f32[1] );
    Result.vector4_f32[2] = Internal::XMScalarATanEst( V.vector4_f32[2] );
    Result.vector4_f32[3] = Internal::XMScalarATanEst( V.vector4_f32[3] );
    return Result;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 absV = vabsq_f32(V);
    __n128 invV = XMVectorReciprocalEst(V);
    // sign = 0 when |V| <= 1 (no fold), +1 when V > 1, -1 otherwise.
    __n128 comp = vcgtq_f32(V, g_XMOne);
    __n128 sign = vbslq_f32(comp, g_XMOne, g_XMNegativeOne );
    comp = vcleq_f32(absV, g_XMOne);
    sign = vbslq_f32(comp, g_XMZero, sign );
    // x = V inside [-1,1], otherwise 1/V (estimate).
    __n128 x = vbslq_f32(comp, V, invV );

    __n128 x2 = vmulq_f32(x, x);

    // Compute polynomial approximation (Horner's method in x^2,
    // coefficients from lane w down to lane x).
    const XMVECTOR AEC = g_XMATanEstCoefficients1;
    __n128 Result = vdupq_lane_f32(vget_high_f32(AEC), 1);

    XMVECTOR vConstants = vdupq_lane_f32(vget_high_f32(AEC), 0);
    Result = vmlaq_f32( vConstants, Result, x2 );

    vConstants = vdupq_lane_f32(vget_low_f32(AEC), 1);
    Result = vmlaq_f32( vConstants, Result, x2 );

    vConstants = vdupq_lane_f32(vget_low_f32( AEC), 0);
    Result = vmlaq_f32( vConstants, Result, x2 );

    // ATanEstCoefficients0 is already splatted
    Result = vmlaq_f32( g_XMATanEstCoefficients0, Result, x2 );
    Result = vmulq_f32( Result, x );

    // Folded lanes use sign*pi/2 - atan(1/V).
    float32x4_t result1 = vmulq_f32(sign, g_XMHalfPi);
    result1 = vsubq_f32(result1, Result);

    comp = vceqq_f32(sign, g_XMZero);
    Result = vbslq_f32( comp, Result, result1 );
    return Result;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128 absV = XMVectorAbs(V);
    __m128 invV = _mm_div_ps(g_XMOne, V); // full divide here (not the rcp estimate)
    // sign = 0 when |V| <= 1 (no fold), +1 when V > 1, -1 otherwise.
    __m128 comp = _mm_cmpgt_ps(V, g_XMOne);
    __m128 select0 = _mm_and_ps(comp, g_XMOne);
    __m128 select1 = _mm_andnot_ps(comp, g_XMNegativeOne);
    __m128 sign = _mm_or_ps(select0, select1);
    comp = _mm_cmple_ps(absV, g_XMOne);
    select0 = _mm_and_ps(comp, g_XMZero);
    select1 = _mm_andnot_ps(comp, sign);
    sign = _mm_or_ps(select0, select1);
    // x = V inside [-1,1], otherwise 1/V.
    select0 = _mm_and_ps(comp, V);
    select1 = _mm_andnot_ps(comp, invV);
    __m128 x = _mm_or_ps(select0, select1);

    __m128 x2 = _mm_mul_ps(x, x);

    // Compute polynomial approximation (Horner's method in x^2,
    // coefficients splatted from lane w down to lane x).
    const XMVECTOR AEC = g_XMATanEstCoefficients1;
    XMVECTOR vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(3, 3, 3, 3) );
    __m128 Result = _mm_mul_ps(vConstants, x2);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(2, 2, 2, 2) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(1, 1, 1, 1) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    vConstants = XM_PERMUTE_PS( AEC, _MM_SHUFFLE(0, 0, 0, 0) );
    Result = _mm_add_ps(Result, vConstants);
    Result = _mm_mul_ps(Result, x2);

    // ATanEstCoefficients0 is already splatted
    Result = _mm_add_ps(Result, g_XMATanEstCoefficients0);
    Result = _mm_mul_ps(Result, x);
    // Folded lanes use sign*pi/2 - atan(1/V).
    __m128 result1 = _mm_mul_ps(sign, g_XMHalfPi);
    result1 = _mm_sub_ps(result1, Result);

    comp = _mm_cmpeq_ps(sign, g_XMZero);
    select0 = _mm_and_ps(comp, Result);
    select1 = _mm_andnot_ps(comp, result1);
    Result = _mm_or_ps(select0, select1);
    return Result;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
4984
4985//------------------------------------------------------------------------------
4986
inline XMVECTOR XMVectorATan2Est
(
    FXMVECTOR Y,
    FXMVECTOR X
)
{
    // Estimated per-component atan2(Y, X). Special cases (Y == 0, X == 0,
    // +/-infinity in either input) are resolved with precomputed multiples
    // of pi carrying Y's sign; all remaining lanes fall through to
    // XMVectorATanEst(Y/X) with a +/-pi correction when X is negative.

    static const XMVECTORF32 ATan2Constants = {XM_PI, XM_PIDIV2, XM_PIDIV4, 2.3561944905f /* Pi*3/4 */};

    const XMVECTOR Zero = XMVectorZero();
    XMVECTOR ATanResultValid = XMVectorTrueInt();

    XMVECTOR Pi = XMVectorSplatX(ATan2Constants);
    XMVECTOR PiOverTwo = XMVectorSplatY(ATan2Constants);
    XMVECTOR PiOverFour = XMVectorSplatZ(ATan2Constants);
    XMVECTOR ThreePiOverFour = XMVectorSplatW(ATan2Constants);

    // Classify the inputs.
    XMVECTOR YEqualsZero = XMVectorEqual(Y, Zero);
    XMVECTOR XEqualsZero = XMVectorEqual(X, Zero);
    XMVECTOR XIsPositive = XMVectorAndInt(X, g_XMNegativeZero.v); // isolate X's sign bit
    XIsPositive = XMVectorEqualInt(XIsPositive, Zero);            // true when the sign bit is clear
    XMVECTOR YEqualsInfinity = XMVectorIsInfinite(Y);
    XMVECTOR XEqualsInfinity = XMVectorIsInfinite(X);

    // Copy Y's sign bit onto each pi-based constant.
    XMVECTOR YSign = XMVectorAndInt(Y, g_XMNegativeZero.v);
    Pi = XMVectorOrInt(Pi, YSign);
    PiOverTwo = XMVectorOrInt(PiOverTwo, YSign);
    PiOverFour = XMVectorOrInt(PiOverFour, YSign);
    ThreePiOverFour = XMVectorOrInt(ThreePiOverFour, YSign);

    // Build the special-case result; lanes that remain equal to the
    // all-ones ATanResultValid pattern take the general path below.
    XMVECTOR R1 = XMVectorSelect(Pi, YSign, XIsPositive);
    XMVECTOR R2 = XMVectorSelect(ATanResultValid, PiOverTwo, XEqualsZero);
    XMVECTOR R3 = XMVectorSelect(R2, R1, YEqualsZero);
    XMVECTOR R4 = XMVectorSelect(ThreePiOverFour, PiOverFour, XIsPositive);
    XMVECTOR R5 = XMVectorSelect(PiOverTwo, R4, XEqualsInfinity);
    XMVECTOR Result = XMVectorSelect(R3, R5, YEqualsInfinity);
    ATanResultValid = XMVectorEqualInt(Result, ATanResultValid);

    // General case: atan(Y * (1/X)) with the estimate reciprocal,
    // shifted by signed pi for lanes where X is negative.
    XMVECTOR Reciprocal = XMVectorReciprocalEst(X);
    XMVECTOR V = XMVectorMultiply(Y, Reciprocal);
    XMVECTOR R0 = XMVectorATanEst(V);

    R1 = XMVectorSelect( Pi, Zero, XIsPositive );
    R2 = XMVectorAdd(R0, R1);

    Result = XMVectorSelect(Result, R2, ATanResultValid);

    return Result;
}
5035
5036//------------------------------------------------------------------------------
5037
5038inline XMVECTOR XMVectorLerp
5039(
5040 FXMVECTOR V0,
5041 FXMVECTOR V1,
5042 float t
5043)
5044{
5045 // V0 + t * (V1 - V0)
5046
5047#if defined(_XM_NO_INTRINSICS_)
5048
5049 XMVECTOR Scale = XMVectorReplicate(t);
5050 XMVECTOR Length = XMVectorSubtract(V1, V0);
5051 return XMVectorMultiplyAdd(Length, Scale, V0);
5052
5053#elif defined(_XM_ARM_NEON_INTRINSICS_)
5054 XMVECTOR L = vsubq_f32( V1, V0 );
5055 return vmlaq_n_f32( V0, L, t );
5056#elif defined(_XM_SSE_INTRINSICS_)
5057 XMVECTOR L = _mm_sub_ps( V1, V0 );
5058 XMVECTOR S = _mm_set_ps1( t );
5059 XMVECTOR Result = _mm_mul_ps( L, S );
5060 return _mm_add_ps( Result, V0 );
5061#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
5062#endif // _XM_VMX128_INTRINSICS_
5063}
5064
5065//------------------------------------------------------------------------------
5066
5067inline XMVECTOR XMVectorLerpV
5068(
5069 FXMVECTOR V0,
5070 FXMVECTOR V1,
5071 FXMVECTOR T
5072)
5073{
5074 // V0 + T * (V1 - V0)
5075
5076#if defined(_XM_NO_INTRINSICS_)
5077
5078 XMVECTOR Length = XMVectorSubtract(V1, V0);
5079 return XMVectorMultiplyAdd(Length, T, V0);
5080
5081#elif defined(_XM_ARM_NEON_INTRINSICS_)
5082 XMVECTOR L = vsubq_f32( V1, V0 );
5083 return vmlaq_f32( V0, L, T );
5084#elif defined(_XM_SSE_INTRINSICS_)
5085 XMVECTOR Length = _mm_sub_ps( V1, V0 );
5086 XMVECTOR Result = _mm_mul_ps( Length, T );
5087 return _mm_add_ps( Result, V0 );
5088#else // _XM_VMX128_INTRINSICS_
5089#endif // _XM_VMX128_INTRINSICS_
5090}
5091
5092//------------------------------------------------------------------------------
5093
5094inline XMVECTOR XMVectorHermite
5095(
5096 FXMVECTOR Position0,
5097 FXMVECTOR Tangent0,
5098 FXMVECTOR Position1,
5099 GXMVECTOR Tangent1,
5100 float t
5101)
5102{
5103 // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
5104 // (t^3 - 2 * t^2 + t) * Tangent0 +
5105 // (-2 * t^3 + 3 * t^2) * Position1 +
5106 // (t^3 - t^2) * Tangent1
5107
5108#if defined(_XM_NO_INTRINSICS_)
5109
5110 float t2 = t * t;
5111 float t3 = t * t2;
5112
5113 XMVECTOR P0 = XMVectorReplicate(2.0f * t3 - 3.0f * t2 + 1.0f);
5114 XMVECTOR T0 = XMVectorReplicate(t3 - 2.0f * t2 + t);
5115 XMVECTOR P1 = XMVectorReplicate(-2.0f * t3 + 3.0f * t2);
5116 XMVECTOR T1 = XMVectorReplicate(t3 - t2);
5117
5118 XMVECTOR Result = XMVectorMultiply(P0, Position0);
5119 Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
5120 Result = XMVectorMultiplyAdd(P1, Position1, Result);
5121 Result = XMVectorMultiplyAdd(T1, Tangent1, Result);
5122
5123 return Result;
5124
5125#elif defined(_XM_ARM_NEON_INTRINSICS_)
5126 float t2 = t * t;
5127 float t3 = t * t2;
5128
5129 XMVECTOR P0 = vdupq_n_f32(2.0f * t3 - 3.0f * t2 + 1.0f);
5130 XMVECTOR T0 = vdupq_n_f32(t3 - 2.0f * t2 + t);
5131 XMVECTOR P1 = vdupq_n_f32(-2.0f * t3 + 3.0f * t2);
5132 XMVECTOR T1 = vdupq_n_f32(t3 - t2);
5133
5134 XMVECTOR vResult = vmulq_f32(P0, Position0);
5135 vResult = vmlaq_f32( vResult, T0, Tangent0 );
5136 vResult = vmlaq_f32( vResult, P1, Position1 );
5137 vResult = vmlaq_f32( vResult, T1, Tangent1 );
5138 return vResult;
5139#elif defined(_XM_SSE_INTRINSICS_)
5140 float t2 = t * t;
5141 float t3 = t * t2;
5142
5143 XMVECTOR P0 = _mm_set_ps1(2.0f * t3 - 3.0f * t2 + 1.0f);
5144 XMVECTOR T0 = _mm_set_ps1(t3 - 2.0f * t2 + t);
5145 XMVECTOR P1 = _mm_set_ps1(-2.0f * t3 + 3.0f * t2);
5146 XMVECTOR T1 = _mm_set_ps1(t3 - t2);
5147
5148 XMVECTOR vResult = _mm_mul_ps(P0, Position0);
5149 XMVECTOR vTemp = _mm_mul_ps(T0, Tangent0);
5150 vResult = _mm_add_ps(vResult,vTemp);
5151 vTemp = _mm_mul_ps(P1, Position1);
5152 vResult = _mm_add_ps(vResult,vTemp);
5153 vTemp = _mm_mul_ps(T1, Tangent1);
5154 vResult = _mm_add_ps(vResult,vTemp);
5155 return vResult;
5156#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
5157#endif // _XM_VMX128_INTRINSICS_
5158}
5159
5160//------------------------------------------------------------------------------
5161
inline XMVECTOR XMVectorHermiteV
(
    FXMVECTOR Position0,
    FXMVECTOR Tangent0,
    FXMVECTOR Position1,
    GXMVECTOR Tangent1,
    CXMVECTOR T
)
{
    // Hermite spline interpolation with an independent parameter per
    // component: lane i of T interpolates lane i of the inputs.
    //
    // Result = (2 * t^3 - 3 * t^2 + 1) * Position0 +
    //          (t^3 - 2 * t^2 + t) * Tangent0 +
    //          (-2 * t^3 + 3 * t^2) * Position1 +
    //          (t^3 - t^2) * Tangent1

#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR T2 = XMVectorMultiply(T, T);
    XMVECTOR T3 = XMVectorMultiply(T , T2);

    // Each basis weight is computed from its own lane of T, then
    // replicated for the multiply-add below.
    XMVECTOR P0 = XMVectorReplicate(2.0f * T3.vector4_f32[0] - 3.0f * T2.vector4_f32[0] + 1.0f);
    XMVECTOR T0 = XMVectorReplicate(T3.vector4_f32[1] - 2.0f * T2.vector4_f32[1] + T.vector4_f32[1]);
    XMVECTOR P1 = XMVectorReplicate(-2.0f * T3.vector4_f32[2] + 3.0f * T2.vector4_f32[2]);
    XMVECTOR T1 = XMVectorReplicate(T3.vector4_f32[3] - T2.vector4_f32[3]);

    XMVECTOR Result = XMVectorMultiply(P0, Position0);
    Result = XMVectorMultiplyAdd(T0, Tangent0, Result);
    Result = XMVectorMultiplyAdd(P1, Position1, Result);
    Result = XMVectorMultiplyAdd(T1, Tangent1, Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // The four basis weights are built at once:
    // T3*CatMulT3 + T2*CatMulT2 + (0,t,0,0) + (1,0,0,0)
    // leaves the P0/T0/P1/T1 weights in lanes x/y/z/w of T3.
    static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f};
    static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f};

    XMVECTOR T2 = vmulq_f32(T,T);
    XMVECTOR T3 = vmulq_f32(T,T2);
    // Mul by the constants against t^2
    T2 = vmulq_f32(T2,CatMulT2);
    // Mul by the constants against t^3
    T3 = vmlaq_f32(T2, T3, CatMulT3 );
    // T3 now has the pre-result.
    // I need to add t.y only
    T2 = vandq_u32(T,g_XMMaskY);
    T3 = vaddq_f32(T3,T2);
    // Add 1.0f to x
    T3 = vaddq_f32(T3,g_XMIdentityR0);
    // Now, I have the constants created
    // Mul the x constant to Position0
    XMVECTOR vResult = vdupq_lane_f32( vget_low_f32( T3 ), 0 ); // T3[0]
    vResult = vmulq_f32(vResult,Position0);
    // Mul the y constant to Tangent0
    T2 = vdupq_lane_f32( vget_low_f32( T3 ), 1 ); // T3[1]
    vResult = vmlaq_f32(vResult, T2, Tangent0 );
    // Mul the z constant to Position1
    T2 = vdupq_lane_f32( vget_high_f32( T3 ), 0 ); // T3[2]
    vResult = vmlaq_f32(vResult, T2, Position1 );
    // Mul the w constant to Tangent1
    T3 = vdupq_lane_f32( vget_high_f32( T3 ), 1 ); // T3[3]
    vResult = vmlaq_f32(vResult, T3, Tangent1 );
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Same weight construction as the NEON path:
    // T3*CatMulT3 + T2*CatMulT2 + (0,t,0,0) + (1,0,0,0)
    // leaves the P0/T0/P1/T1 weights in lanes x/y/z/w of T3.
    static const XMVECTORF32 CatMulT2 = {-3.0f,-2.0f,3.0f,-1.0f};
    static const XMVECTORF32 CatMulT3 = {2.0f,1.0f,-2.0f,1.0f};

    XMVECTOR T2 = _mm_mul_ps(T,T);
    XMVECTOR T3 = _mm_mul_ps(T,T2);
    // Mul by the constants against t^2
    T2 = _mm_mul_ps(T2,CatMulT2);
    // Mul by the constants against t^3
    T3 = _mm_mul_ps(T3,CatMulT3);
    // T3 now has the pre-result.
    T3 = _mm_add_ps(T3,T2);
    // I need to add t.y only
    T2 = _mm_and_ps(T,g_XMMaskY);
    T3 = _mm_add_ps(T3,T2);
    // Add 1.0f to x
    T3 = _mm_add_ps(T3,g_XMIdentityR0);
    // Now, I have the constants created
    // Mul the x constant to Position0
    XMVECTOR vResult = XM_PERMUTE_PS(T3,_MM_SHUFFLE(0,0,0,0));
    vResult = _mm_mul_ps(vResult,Position0);
    // Mul the y constant to Tangent0
    T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(1,1,1,1));
    T2 = _mm_mul_ps(T2,Tangent0);
    vResult = _mm_add_ps(vResult,T2);
    // Mul the z constant to Position1
    T2 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(2,2,2,2));
    T2 = _mm_mul_ps(T2,Position1);
    vResult = _mm_add_ps(vResult,T2);
    // Mul the w constant to Tangent1
    T3 = XM_PERMUTE_PS(T3,_MM_SHUFFLE(3,3,3,3));
    T3 = _mm_mul_ps(T3,Tangent1);
    vResult = _mm_add_ps(vResult,T3);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
5260
5261//------------------------------------------------------------------------------
5262
5263inline XMVECTOR XMVectorCatmullRom
5264(
5265 FXMVECTOR Position0,
5266 FXMVECTOR Position1,
5267 FXMVECTOR Position2,
5268 GXMVECTOR Position3,
5269 float t
5270)
5271{
5272 // Result = ((-t^3 + 2 * t^2 - t) * Position0 +
5273 // (3 * t^3 - 5 * t^2 + 2) * Position1 +
5274 // (-3 * t^3 + 4 * t^2 + t) * Position2 +
5275 // (t^3 - t^2) * Position3) * 0.5
5276
5277#if defined(_XM_NO_INTRINSICS_)
5278
5279 float t2 = t * t;
5280 float t3 = t * t2;
5281
5282 XMVECTOR P0 = XMVectorReplicate((-t3 + 2.0f * t2 - t) * 0.5f);
5283 XMVECTOR P1 = XMVectorReplicate((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
5284 XMVECTOR P2 = XMVectorReplicate((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
5285 XMVECTOR P3 = XMVectorReplicate((t3 - t2) * 0.5f);
5286
5287 XMVECTOR Result = XMVectorMultiply(P0, Position0);
5288 Result = XMVectorMultiplyAdd(P1, Position1, Result);
5289 Result = XMVectorMultiplyAdd(P2, Position2, Result);
5290 Result = XMVectorMultiplyAdd(P3, Position3, Result);
5291
5292 return Result;
5293
5294#elif defined(_XM_ARM_NEON_INTRINSICS_)
5295 float t2 = t * t;
5296 float t3 = t * t2;
5297
5298 XMVECTOR P0 = vdupq_n_f32((-t3 + 2.0f * t2 - t) * 0.5f);
5299 XMVECTOR P1 = vdupq_n_f32((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
5300 XMVECTOR P2 = vdupq_n_f32((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
5301 XMVECTOR P3 = vdupq_n_f32((t3 - t2) * 0.5f);
5302
5303 P1 = vmulq_f32(P1, Position1);
5304 P0 = vmlaq_f32(P1, P0, Position0);
5305 P3 = vmulq_f32(P3, Position3);
5306 P2 = vmlaq_f32(P3, P2, Position2);
5307 P0 = vaddq_f32(P0,P2);
5308 return P0;
5309#elif defined(_XM_SSE_INTRINSICS_)
5310 float t2 = t * t;
5311 float t3 = t * t2;
5312
5313 XMVECTOR P0 = _mm_set_ps1((-t3 + 2.0f * t2 - t) * 0.5f);
5314 XMVECTOR P1 = _mm_set_ps1((3.0f * t3 - 5.0f * t2 + 2.0f) * 0.5f);
5315 XMVECTOR P2 = _mm_set_ps1((-3.0f * t3 + 4.0f * t2 + t) * 0.5f);
5316 XMVECTOR P3 = _mm_set_ps1((t3 - t2) * 0.5f);
5317
5318 P0 = _mm_mul_ps(P0, Position0);
5319 P1 = _mm_mul_ps(P1, Position1);
5320 P2 = _mm_mul_ps(P2, Position2);
5321 P3 = _mm_mul_ps(P3, Position3);
5322 P0 = _mm_add_ps(P0,P1);
5323 P2 = _mm_add_ps(P2,P3);
5324 P0 = _mm_add_ps(P0,P2);
5325 return P0;
5326#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
5327#endif // _XM_VMX128_INTRINSICS_
5328}
5329
5330//------------------------------------------------------------------------------
5331
inline XMVECTOR XMVectorCatmullRomV
(
    FXMVECTOR Position0,
    FXMVECTOR Position1,
    FXMVECTOR Position2,
    GXMVECTOR Position3,
    CXMVECTOR T
)
{
    // Catmull-Rom spline interpolation with an independent parameter per
    // component: lane i of T interpolates lane i of the control points,
    // using the same basis as XMVectorCatmullRom.
#if defined(_XM_NO_INTRINSICS_)
    float fx = T.vector4_f32[0];
    float fy = T.vector4_f32[1];
    float fz = T.vector4_f32[2];
    float fw = T.vector4_f32[3];
    // Each lane expands the full Catmull-Rom polynomial with its own t.
    XMVECTOR vResult = {
        0.5f*((-fx*fx*fx+2*fx*fx-fx)*Position0.vector4_f32[0]+
        (3*fx*fx*fx-5*fx*fx+2)*Position1.vector4_f32[0]+
        (-3*fx*fx*fx+4*fx*fx+fx)*Position2.vector4_f32[0]+
        (fx*fx*fx-fx*fx)*Position3.vector4_f32[0]),
        0.5f*((-fy*fy*fy+2*fy*fy-fy)*Position0.vector4_f32[1]+
        (3*fy*fy*fy-5*fy*fy+2)*Position1.vector4_f32[1]+
        (-3*fy*fy*fy+4*fy*fy+fy)*Position2.vector4_f32[1]+
        (fy*fy*fy-fy*fy)*Position3.vector4_f32[1]),
        0.5f*((-fz*fz*fz+2*fz*fz-fz)*Position0.vector4_f32[2]+
        (3*fz*fz*fz-5*fz*fz+2)*Position1.vector4_f32[2]+
        (-3*fz*fz*fz+4*fz*fz+fz)*Position2.vector4_f32[2]+
        (fz*fz*fz-fz*fz)*Position3.vector4_f32[2]),
        0.5f*((-fw*fw*fw+2*fw*fw-fw)*Position0.vector4_f32[3]+
        (3*fw*fw*fw-5*fw*fw+2)*Position1.vector4_f32[3]+
        (-3*fw*fw*fw+4*fw*fw+fw)*Position2.vector4_f32[3]+
        (fw*fw*fw-fw*fw)*Position3.vector4_f32[3])
    };
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f};
    static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f};
    static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f};
    static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f};
    // Cache T^2 and T^3
    XMVECTOR T2 = vmulq_f32(T,T);
    XMVECTOR T3 = vmulq_f32(T,T2);
    // Perform the Position0 term: 2*t^2 - t - t^3
    XMVECTOR vResult = vaddq_f32(T2,T2);
    vResult = vsubq_f32(vResult,T);
    vResult = vsubq_f32(vResult,T3);
    vResult = vmulq_f32(vResult,Position0);
    // Perform the Position1 term (3*t^3 - 5*t^2 + 2) and add
    XMVECTOR vTemp = vmulq_f32(T3,Catmul3);
    vTemp = vmlsq_f32(vTemp, T2, Catmul5);
    vTemp = vaddq_f32(vTemp,Catmul2);
    vResult = vmlaq_f32(vResult, vTemp, Position1);
    // Perform the Position2 term (4*t^2 - 3*t^3 + t) and add
    vTemp = vmulq_f32(T2,Catmul4);
    vTemp = vmlsq_f32(vTemp, T3, Catmul3);
    vTemp = vaddq_f32(vTemp,T);
    vResult = vmlaq_f32(vResult, vTemp, Position2);
    // Position3 is the last term (t^3 - t^2)
    T3 = vsubq_f32(T3,T2);
    vResult = vmlaq_f32(vResult, T3, Position3);
    // Multiply by 0.5f and exit
    vResult = vmulq_f32(vResult,g_XMOneHalf);
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 Catmul2 = {2.0f,2.0f,2.0f,2.0f};
    static const XMVECTORF32 Catmul3 = {3.0f,3.0f,3.0f,3.0f};
    static const XMVECTORF32 Catmul4 = {4.0f,4.0f,4.0f,4.0f};
    static const XMVECTORF32 Catmul5 = {5.0f,5.0f,5.0f,5.0f};
    // Cache T^2 and T^3
    XMVECTOR T2 = _mm_mul_ps(T,T);
    XMVECTOR T3 = _mm_mul_ps(T,T2);
    // Perform the Position0 term: 2*t^2 - t - t^3
    XMVECTOR vResult = _mm_add_ps(T2,T2);
    vResult = _mm_sub_ps(vResult,T);
    vResult = _mm_sub_ps(vResult,T3);
    vResult = _mm_mul_ps(vResult,Position0);
    // Perform the Position1 term (3*t^3 - 5*t^2 + 2) and add
    XMVECTOR vTemp = _mm_mul_ps(T3,Catmul3);
    XMVECTOR vTemp2 = _mm_mul_ps(T2,Catmul5);
    vTemp = _mm_sub_ps(vTemp,vTemp2);
    vTemp = _mm_add_ps(vTemp,Catmul2);
    vTemp = _mm_mul_ps(vTemp,Position1);
    vResult = _mm_add_ps(vResult,vTemp);
    // Perform the Position2 term (4*t^2 - 3*t^3 + t) and add
    vTemp = _mm_mul_ps(T2,Catmul4);
    vTemp2 = _mm_mul_ps(T3,Catmul3);
    vTemp = _mm_sub_ps(vTemp,vTemp2);
    vTemp = _mm_add_ps(vTemp,T);
    vTemp = _mm_mul_ps(vTemp,Position2);
    vResult = _mm_add_ps(vResult,vTemp);
    // Position3 is the last term (t^3 - t^2)
    T3 = _mm_sub_ps(T3,T2);
    T3 = _mm_mul_ps(T3,Position3);
    vResult = _mm_add_ps(vResult,T3);
    // Multiply by 0.5f and exit
    vResult = _mm_mul_ps(vResult,g_XMOneHalf);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
5431
5432//------------------------------------------------------------------------------
5433
5434inline XMVECTOR XMVectorBaryCentric
5435(
5436 FXMVECTOR Position0,
5437 FXMVECTOR Position1,
5438 FXMVECTOR Position2,
5439 float f,
5440 float g
5441)
5442{
5443 // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0)
5444
5445#if defined(_XM_NO_INTRINSICS_)
5446
5447 XMVECTOR P10 = XMVectorSubtract(Position1, Position0);
5448 XMVECTOR ScaleF = XMVectorReplicate(f);
5449
5450 XMVECTOR P20 = XMVectorSubtract(Position2, Position0);
5451 XMVECTOR ScaleG = XMVectorReplicate(g);
5452
5453 XMVECTOR Result = XMVectorMultiplyAdd(P10, ScaleF, Position0);
5454 Result = XMVectorMultiplyAdd(P20, ScaleG, Result);
5455
5456 return Result;
5457
5458#elif defined(_XM_ARM_NEON_INTRINSICS_)
5459 XMVECTOR R1 = vsubq_f32(Position1,Position0);
5460 XMVECTOR SF = vdupq_n_f32(f);
5461 XMVECTOR R2 = vsubq_f32(Position2,Position0);
5462 XMVECTOR SG = vdupq_n_f32(g);
5463 R1 = vmlaq_f32( Position0, R1, SF);
5464 return vmlaq_f32( R1, R2, SG );
5465#elif defined(_XM_SSE_INTRINSICS_)
5466 XMVECTOR R1 = _mm_sub_ps(Position1,Position0);
5467 XMVECTOR SF = _mm_set_ps1(f);
5468 XMVECTOR R2 = _mm_sub_ps(Position2,Position0);
5469 XMVECTOR SG = _mm_set_ps1(g);
5470 R1 = _mm_mul_ps(R1,SF);
5471 R2 = _mm_mul_ps(R2,SG);
5472 R1 = _mm_add_ps(R1,Position0);
5473 R1 = _mm_add_ps(R1,R2);
5474 return R1;
5475#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
5476#endif // _XM_VMX128_INTRINSICS_
5477}
5478
5479//------------------------------------------------------------------------------
5480
5481inline XMVECTOR XMVectorBaryCentricV
5482(
5483 FXMVECTOR Position0,
5484 FXMVECTOR Position1,
5485 FXMVECTOR Position2,
5486 GXMVECTOR F,
5487 CXMVECTOR G
5488)
5489{
5490 // Result = Position0 + f * (Position1 - Position0) + g * (Position2 - Position0)
5491
5492#if defined(_XM_NO_INTRINSICS_)
5493
5494 XMVECTOR P10 = XMVectorSubtract(Position1, Position0);
5495 XMVECTOR P20 = XMVectorSubtract(Position2, Position0);
5496
5497 XMVECTOR Result = XMVectorMultiplyAdd(P10, F, Position0);
5498 Result = XMVectorMultiplyAdd(P20, G, Result);
5499
5500 return Result;
5501
5502#elif defined(_XM_ARM_NEON_INTRINSICS_)
5503 XMVECTOR R1 = vsubq_f32(Position1,Position0);
5504 XMVECTOR R2 = vsubq_f32(Position2,Position0);
5505 R1 = vmlaq_f32( Position0, R1, F );
5506 return vmlaq_f32( R1, R2, G);
5507#elif defined(_XM_SSE_INTRINSICS_)
5508 XMVECTOR R1 = _mm_sub_ps(Position1,Position0);
5509 XMVECTOR R2 = _mm_sub_ps(Position2,Position0);
5510 R1 = _mm_mul_ps(R1,F);
5511 R2 = _mm_mul_ps(R2,G);
5512 R1 = _mm_add_ps(R1,Position0);
5513 R1 = _mm_add_ps(R1,R2);
5514 return R1;
5515#else // _XM_VMX128_INTRINSICS_
5516#endif // _XM_VMX128_INTRINSICS_
5517}
5518
5519/****************************************************************************
5520 *
5521 * 2D Vector
5522 *
5523 ****************************************************************************/
5524
5525//------------------------------------------------------------------------------
5526// Comparison operations
5527//------------------------------------------------------------------------------
5528
5529//------------------------------------------------------------------------------
5530
5531inline bool XMVector2Equal
5532(
5533 FXMVECTOR V1,
5534 FXMVECTOR V2
5535)
5536{
5537#if defined(_XM_NO_INTRINSICS_)
5538 return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1])) != 0);
5539#elif defined(_XM_ARM_NEON_INTRINSICS_)
5540 __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
5541 return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
5542#elif defined(_XM_SSE_INTRINSICS_)
5543 XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
5544// z and w are don't care
5545 return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
5546#else // _XM_VMX128_INTRINSICS_
5547 return XMComparisonAllTrue(XMVector2EqualR(V1, V2));
5548#endif
5549}
5550
5551
5552//------------------------------------------------------------------------------
5553
inline uint32_t XMVector2EqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    // Compares the x and y components of V1 and V2 and returns a CR6-style
    // comparison record: XM_CRMASK_CR6TRUE when both components are equal,
    // XM_CRMASK_CR6FALSE when neither is, and 0 for a mixed result.
    // z and w are ignored.
#if defined(_XM_NO_INTRINSICS_)

    uint32_t CR = 0;
    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] == V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] != V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        // Both lanes compared equal.
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        // Neither lane compared equal.
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
// z and w are don't care
    int iTest = _mm_movemask_ps(vTemp)&3;
    uint32_t CR = 0;
    if (iTest==3)
    {
        // Both lanes compared equal.
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        // Neither lane compared equal.
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
5605
5606//------------------------------------------------------------------------------
5607
// Treats the x and y components of V1 and V2 as raw 32-bit integers and
// returns true when both pairs are bitwise equal. z and w are ignored.
inline bool XMVector2EqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Both lanes equal <=> all 64 mask bits set
    __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Integer compare so NaN bit patterns still compare equal to themselves
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)==3) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllTrue(XMVector2EqualIntR(V1, V2));
#endif
}
5626
5627//------------------------------------------------------------------------------
5628
// Bitwise (integer) equality test of the x and y components, returning a CR6
// record: XM_CRMASK_CR6TRUE when both match, XM_CRMASK_CR6FALSE when both
// differ, 0 (mixed) otherwise. z and w are ignored.
inline uint32_t XMVector2EqualIntR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    uint32_t CR = 0;
    if ((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
        (V1.vector4_u32[1] == V2.vector4_u32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) &&
        (V1.vector4_u32[1] != V2.vector4_u32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    // z and w results are masked off below
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&3;
    uint32_t CR = 0;
    if (iTest==3)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
5679
5680//------------------------------------------------------------------------------
5681
// Returns true when the x and y components of V1 and V2 are equal to within
// the per-component tolerances in Epsilon (|V1 - V2| <= Epsilon, componentwise).
// z and w are ignored.
inline bool XMVector2NearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
)
{
#if defined(_XM_NO_INTRINSICS_)
    float dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
    float dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
    return ((dx <= Epsilon.vector4_f32[0]) &&
            (dy <= Epsilon.vector4_f32[1]));
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vDelta = vsub_f32(vget_low_u32(V1), vget_low_u32(V2));
    // vacle does the absolute-value compare (|delta| <= |epsilon|) in one op
    __n64 vTemp = vacle_f32( vDelta, vget_low_u32(Epsilon) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    return ( r == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Get the difference
    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
    // Get the absolute value of the difference via max(-delta, delta)
    XMVECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_sub_ps(vTemp,vDelta);
    vTemp = _mm_max_ps(vTemp,vDelta);
    vTemp = _mm_cmple_ps(vTemp,Epsilon);
    // z and w are don't care
    return (((_mm_movemask_ps(vTemp)&3)==0x3) != 0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
5712
5713//------------------------------------------------------------------------------
5714
// Returns true when the x or y component of V1 differs from the corresponding
// component of V2 (logical negation of XMVector2Equal). z and w are ignored.
inline bool XMVector2NotEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Not all mask bits set <=> at least one lane differed
    __n64 vTemp = vceq_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
// z and w are don't care
    return (((_mm_movemask_ps(vTemp)&3)!=3) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAnyFalse(XMVector2EqualR(V1, V2));
#endif
}
5734
5735//------------------------------------------------------------------------------
5736
// Returns true when the x or y component of V1 differs bitwise (as a 32-bit
// integer) from the corresponding component of V2. z and w are ignored.
inline bool XMVector2NotEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vceq_u32( vget_low_u32(V1), vget_low_u32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) != 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&3)!=3) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAnyFalse(XMVector2EqualIntR(V1, V2));
#endif
}
5755
5756//------------------------------------------------------------------------------
5757
// Returns true when both the x and y components of V1 are strictly greater
// than those of V2. z and w are ignored.
inline bool XMVector2Greater
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
// z and w are don't care
    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllTrue(XMVector2GreaterR(V1, V2));
#endif
}
5777
5778//------------------------------------------------------------------------------
5779
// Greater-than test of the x and y components, returning a CR6 record:
// XM_CRMASK_CR6TRUE when both components of V1 are > V2, XM_CRMASK_CR6FALSE
// when both are <=, 0 (mixed) otherwise. z and w are ignored.
inline uint32_t XMVector2GreaterR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    uint32_t CR = 0;
    if ((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] > V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] <= V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vcgt_f32( vget_low_f32(V1), vget_low_f32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    int iTest = _mm_movemask_ps(vTemp)&3;
    uint32_t CR = 0;
    if (iTest==3)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
5830
5831//------------------------------------------------------------------------------
5832
// Returns true when both the x and y components of V1 are greater than or
// equal to those of V2. z and w are ignored.
inline bool XMVector2GreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V1, V2));
#endif
}
5851
5852//------------------------------------------------------------------------------
5853
// Greater-or-equal test of the x and y components, returning a CR6 record:
// XM_CRMASK_CR6TRUE when both components of V1 are >= V2, XM_CRMASK_CR6FALSE
// when both are <, 0 (mixed) otherwise. z and w are ignored.
inline uint32_t XMVector2GreaterOrEqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    uint32_t CR = 0;
    if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] >= V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] < V2.vector4_f32[1]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vcge_f32( vget_low_f32(V1), vget_low_f32(V2) );
    uint64_t r = vget_lane_u64( vTemp, 0 );
    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    int iTest = _mm_movemask_ps(vTemp)&3;
    uint32_t CR = 0;
    if (iTest == 3)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
5904
5905//------------------------------------------------------------------------------
5906
// Returns true when both the x and y components of V1 are strictly less than
// those of V2. z and w are ignored.
inline bool XMVector2Less
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vclt_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmplt_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
#else // _XM_VMX128_INTRINSICS_
    // V1 < V2 <=> V2 > V1
    return XMComparisonAllTrue(XMVector2GreaterR(V2, V1));
#endif
}
5925
5926//------------------------------------------------------------------------------
5927
// Returns true when both the x and y components of V1 are less than or equal
// to those of V2. z and w are ignored.
inline bool XMVector2LessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 vTemp = vcle_f32( vget_low_f32(V1), vget_low_f32(V2) );
    return ( vget_lane_u64( vTemp, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&3)==3) != 0);
#else // _XM_VMX128_INTRINSICS_
    // V1 <= V2 <=> V2 >= V1
    return XMComparisonAllTrue(XMVector2GreaterOrEqualR(V2, V1));
#endif
}
5946
5947//------------------------------------------------------------------------------
5948
// Returns true when the x and y components of V lie within the symmetric
// per-component range [-Bounds, +Bounds]. z and w are ignored.
inline bool XMVector2InBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
        (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 VL = vget_low_f32( V );
    __n64 B = vget_low_f32( Bounds );
    // Test if less than or equal
    __n64 vTemp1 = vcle_f32(VL,B);
    // Negate the bounds
    __n64 vTemp2 = vneg_f32(B);
    // Test if greater or equal (Reversed)
    vTemp2 = vcle_f32(vTemp2,VL);
    // Blend answers
    vTemp1 = vand_u32(vTemp1,vTemp2);
    // x and y in bounds?
    return ( vget_lane_u64( vTemp1, 0 ) == 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);
    // x and y in bounds? (z and w are don't care)
    return (((_mm_movemask_ps(vTemp1)&0x3)==0x3) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllInBounds(XMVector2InBoundsR(V, Bounds));
#endif
}
5986
5987
5988//------------------------------------------------------------------------------
5989
// Returns true when the x or y component of V is NaN. z and w are ignored.
inline bool XMVector2IsNaN
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (XMISNAN(V.vector4_f32[0]) ||
            XMISNAN(V.vector4_f32[1]));
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 VL = vget_low_f32( V );
    // Test against itself. NaN is always not equal
    __n64 vTempNan = vceq_f32( VL, VL );
    // If x or y are NaN, the mask is zero
    return ( vget_lane_u64( vTempNan, 0 ) != 0xFFFFFFFFFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test against itself. NaN is always not equal
    XMVECTOR vTempNan = _mm_cmpneq_ps(V,V);
    // If x or y are NaN, the mask is non-zero
    return ((_mm_movemask_ps(vTempNan)&3) != 0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6012
6013//------------------------------------------------------------------------------
6014
// Returns true when the x or y component of V is +infinity or -infinity
// (sign bit is masked off before the compare). z and w are ignored.
inline bool XMVector2IsInfinite
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    return (XMISINF(V.vector4_f32[0]) ||
            XMISINF(V.vector4_f32[1]));
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Mask off the sign bit
    __n64 vTemp = vand_u32( vget_low_f32( V ) , vget_low_f32( g_XMAbsMask ) );
    // Compare to infinity
    vTemp = vceq_f32(vTemp, vget_low_f32( g_XMInfinity) );
    // If any are infinity, the signs are true.
    return vget_lane_u64( vTemp, 0 ) != 0;
#elif defined(_XM_SSE_INTRINSICS_)
    // Mask off the sign bit
    __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
    // If x or y are infinity, the signs are true.
    return ((_mm_movemask_ps(vTemp)&3) != 0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6041
6042//------------------------------------------------------------------------------
6043// Computation operations
6044//------------------------------------------------------------------------------
6045
6046//------------------------------------------------------------------------------
6047
// Computes the 2D dot product V1.x*V2.x + V1.y*V2.y and replicates the scalar
// result into all four components of the returned vector.
inline XMVECTOR XMVector2Dot
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] =
    Result.vector4_f32[1] =
    Result.vector4_f32[2] =
    Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Perform the dot product on x and y
    __n64 vTemp = vmul_f32( vget_low_f32(V1), vget_low_f32(V2) );
    // Pairwise add: lane0 = lane1 = x*x' + y*y'
    vTemp = vpadd_f32( vTemp, vTemp );
    return vcombine_f32( vTemp, vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V1,V2);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    // Splat the sum into all four lanes
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6080
6081//------------------------------------------------------------------------------
6082
// Computes the scalar 2D cross product (z component of the 3D cross of the
// two vectors extended with z=0) and replicates it into all four components.
inline XMVECTOR XMVector2Cross
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
    // [ V1.x*V2.y - V1.y*V2.x, V1.x*V2.y - V1.y*V2.x ]

#if defined(_XM_NO_INTRINSICS_)
    float fCross = (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]);
    XMVECTOR vResult = {
        fCross,
        fCross,
        fCross,
        fCross
    };
    return vResult;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Negate the second product so the pairwise add yields a subtraction
    static const XMVECTORF32 Negate = { 1.f, -1.f, 0, 0 };

    __n64 vTemp = vmul_f32( vget_low_f32( V1 ), vrev64_f32( vget_low_f32( V2 ) ) );
    vTemp = vmul_f32( vTemp, vget_low_f32( Negate ) );
    vTemp = vpadd_f32( vTemp, vTemp );
    return vcombine_f32( vTemp, vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
    // Swap x and y
    XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(0,1,0,1));
    // Perform the muls
    vResult = _mm_mul_ps(vResult,V1);
    // Splat y
    XMVECTOR vTemp = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(1,1,1,1));
    // Sub the values
    vResult = _mm_sub_ss(vResult,vTemp);
    // Splat the cross product
    vResult = XM_PERMUTE_PS(vResult,_MM_SHUFFLE(0,0,0,0));
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6122
6123//------------------------------------------------------------------------------
6124
6125inline XMVECTOR XMVector2LengthSq
6126(
6127 FXMVECTOR V
6128)
6129{
6130 return XMVector2Dot(V, V);
6131}
6132
6133//------------------------------------------------------------------------------
6134
// Estimates 1 / length of the x,y components of V using a fast reciprocal
// square root approximation, replicated into all four components. Lower
// precision than XMVector2ReciprocalLength.
inline XMVECTOR XMVector2ReciprocalLengthEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVector2LengthSq(V);
    Result = XMVectorReciprocalSqrtEst(Result);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 VL = vget_low_f32(V);
    // Dot2
    __n64 vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    // Reciprocal sqrt (estimate)
    vTemp = vrsqrte_f32( vTemp );
    return vcombine_f32( vTemp, vTemp );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    // Hardware reciprocal sqrt estimate
    vLengthSq = _mm_rsqrt_ss(vLengthSq);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6168
6169//------------------------------------------------------------------------------
6170
// Computes 1 / length of the x,y components of V at full precision,
// replicated into all four components. No zero-length guard: a zero vector
// produces infinity (NEON path refines an estimate with two Newton-Raphson
// steps; SSE path uses an exact sqrt and divide).
inline XMVECTOR XMVector2ReciprocalLength
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVector2LengthSq(V);
    Result = XMVectorReciprocalSqrt(Result);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 VL = vget_low_f32(V);
    // Dot2
    __n64 vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    // Reciprocal sqrt: estimate + 2 Newton-Raphson refinement steps
    __n64 S0 = vrsqrte_f32(vTemp);
    __n64 P0 = vmul_f32( vTemp, S0 );
    __n64 R0 = vrsqrts_f32( P0, S0 );
    __n64 S1 = vmul_f32( S0, R0 );
    __n64 P1 = vmul_f32( vTemp, S1 );
    __n64 R1 = vrsqrts_f32( P1, S1 );
    __n64 Result = vmul_f32( S1, R1 );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    // Exact: 1.0f / sqrt(x+y)
    vLengthSq = _mm_sqrt_ss(vLengthSq);
    vLengthSq = _mm_div_ss(g_XMOne,vLengthSq);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6211
6212//------------------------------------------------------------------------------
6213
// Estimates the length of the x,y components of V, replicated into all four
// components. NEON path computes sqrt as x * rsqrte(x) with an explicit
// zero-input select (since rsqrte(0) is infinity).
inline XMVECTOR XMVector2LengthEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVector2LengthSq(V);
    Result = XMVectorSqrtEst(Result);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 VL = vget_low_f32(V);
    // Dot2
    __n64 vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    const __n64 zero = vdup_n_u32(0);
    // Remember zero-length inputs so the infinity from rsqrte can be masked out
    __n64 VEqualsZero = vceq_f32( vTemp, zero );
    // Sqrt (estimate): sqrt(x) ~= x * rsqrte(x)
    __n64 Result = vrsqrte_f32( vTemp );
    Result = vmul_f32( vTemp, Result );
    Result = vbsl_f32( VEqualsZero, zero, Result );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = _mm_sqrt_ss(vLengthSq);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6251
6252//------------------------------------------------------------------------------
6253
// Computes the length of the x,y components of V at full precision,
// replicated into all four components. NEON path refines rsqrte with two
// Newton-Raphson steps and selects 0 for zero-length input.
inline XMVECTOR XMVector2Length
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVector2LengthSq(V);
    Result = XMVectorSqrt(Result);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 VL = vget_low_f32(V);
    // Dot2
    __n64 vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    const __n64 zero = vdup_n_u32(0);
    __n64 VEqualsZero = vceq_f32( vTemp, zero );
    // Sqrt: refined reciprocal sqrt, then multiply by the squared length
    __n64 S0 = vrsqrte_f32( vTemp );
    __n64 P0 = vmul_f32( vTemp, S0 );
    __n64 R0 = vrsqrts_f32( P0, S0 );
    __n64 S1 = vmul_f32( S0, R0 );
    __n64 P1 = vmul_f32( vTemp, S1 );
    __n64 R1 = vrsqrts_f32( P1, S1 );
    __n64 Result = vmul_f32( S1, R1 );
    Result = vmul_f32( vTemp, Result );
    Result = vbsl_f32( VEqualsZero, zero, Result );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6297
6298//------------------------------------------------------------------------------
6299// XMVector2NormalizeEst uses a reciprocal estimate and
6300// returns QNaN on zero and infinite vectors.
6301
// XMVector2NormalizeEst uses a reciprocal estimate and
// returns QNaN on zero and infinite vectors.
// Scales all four components of V by the estimated 1/length of (x,y).
inline XMVECTOR XMVector2NormalizeEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVector2ReciprocalLength(V);
    Result = XMVectorMultiply(V, Result);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 VL = vget_low_f32(V);
    // Dot2
    __n64 vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    // Reciprocal sqrt (estimate)
    vTemp = vrsqrte_f32( vTemp );
    // Normalize
    __n64 Result = vmul_f32( VL, vTemp );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has y splatted
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    // x+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = _mm_rsqrt_ss(vLengthSq);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    vLengthSq = _mm_mul_ps(vLengthSq,V);
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6338
6339//------------------------------------------------------------------------------
6340
// Normalizes V using the length of its x,y components. Zero-length input
// yields a zero vector (scalar/NEON/SSE all guard this); infinite-length
// input yields QNaN in the NEON and SSE paths.
// NOTE(review): the scalar path leaves a zero vector unchanged (multiplies by
// the raw length 0 check result), which matches "zero in, zero out".
inline XMVECTOR XMVector2Normalize
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR vResult = XMVector2Length( V );
    float fLength = vResult.vector4_f32[0];

    // Prevent divide by zero
    if (fLength > 0) {
        fLength = 1.0f/fLength;
    }

    vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
    vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
    vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
    vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 VL = vget_low_f32(V);
    // Dot2
    __n64 vTemp = vmul_f32( VL, VL );
    vTemp = vpadd_f32( vTemp, vTemp );
    // Remember degenerate inputs so they can be patched after the multiply
    __n64 VEqualsZero = vceq_f32( vTemp, vdup_n_u32(0) );
    __n64 VEqualsInf = vceq_f32( vTemp, vget_low_f32(g_XMInfinity) );
    // Reciprocal sqrt (2 iterations of Newton-Raphson)
    __n64 S0 = vrsqrte_f32( vTemp );
    __n64 P0 = vmul_f32( vTemp, S0 );
    __n64 R0 = vrsqrts_f32( P0, S0 );
    __n64 S1 = vmul_f32( S0, R0 );
    __n64 P1 = vmul_f32( vTemp, S1 );
    __n64 R1 = vrsqrts_f32( P1, S1 );
    vTemp = vmul_f32( S1, R1 );
    // Normalize
    __n64 Result = vmul_f32( VL, vTemp );
    Result = vbsl_f32( VEqualsZero, vdup_n_f32(0), Result );
    Result = vbsl_f32( VEqualsInf, vget_low_f32(g_XMQNaN), Result );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x and y only
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,1,1,1));
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
    // Reciprocal mul to perform the normalization
    vResult = _mm_div_ps(V,vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult,vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
    vResult = _mm_or_ps(vTemp1,vTemp2);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6409
6410//------------------------------------------------------------------------------
6411
6412inline XMVECTOR XMVector2ClampLength
6413(
6414 FXMVECTOR V,
6415 float LengthMin,
6416 float LengthMax
6417)
6418{
6419 XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
6420 XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
6421 return XMVector2ClampLengthV(V, ClampMin, ClampMax);
6422}
6423
6424//------------------------------------------------------------------------------
6425
// Clamps the 2D length of V to [LengthMin, LengthMax], where the bounds are
// given as replicated vectors (x == y asserted). Zero and infinite-length
// inputs are propagated through the select chain rather than normalized, and
// the original V is returned unchanged (no precision loss) when its length
// already lies within the range.
inline XMVECTOR XMVector2ClampLengthV
(
    FXMVECTOR V,
    FXMVECTOR LengthMin,
    FXMVECTOR LengthMax
)
{
    // Bounds must be replicated scalars with 0 <= LengthMin <= LengthMax
    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)));
    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)));
    assert(XMVector2GreaterOrEqual(LengthMin, g_XMZero));
    assert(XMVector2GreaterOrEqual(LengthMax, g_XMZero));
    assert(XMVector2GreaterOrEqual(LengthMax, LengthMin));

    XMVECTOR LengthSq = XMVector2LengthSq(V);

    const XMVECTOR Zero = XMVectorZero();

    XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);

    // Degenerate cases: length^2 exactly +inf or exactly 0
    XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
    XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);

    // Length = LengthSq * 1/sqrt(LengthSq) = sqrt(LengthSq)
    XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);

    XMVECTOR Normal = XMVectorMultiply(V, RcpLength);

    // Select is true only when the input is neither zero- nor infinite-length
    XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
    Length = XMVectorSelect(LengthSq, Length, Select);
    Normal = XMVectorSelect(LengthSq, Normal, Select);

    XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
    XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);

    XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
    ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);

    XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);

    // Preserve the original vector (with no precision loss) if the length falls within the given range
    XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
    Result = XMVectorSelect(Result, V, Control);

    return Result;
}
6470
6471//------------------------------------------------------------------------------
6472
6473inline XMVECTOR XMVector2Reflect
6474(
6475 FXMVECTOR Incident,
6476 FXMVECTOR Normal
6477)
6478{
6479 // Result = Incident - (2 * dot(Incident, Normal)) * Normal
6480
6481 XMVECTOR Result;
6482 Result = XMVector2Dot(Incident, Normal);
6483 Result = XMVectorAdd(Result, Result);
6484 Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
6485 return Result;
6486}
6487
6488//------------------------------------------------------------------------------
6489
6490inline XMVECTOR XMVector2Refract
6491(
6492 FXMVECTOR Incident,
6493 FXMVECTOR Normal,
6494 float RefractionIndex
6495)
6496{
6497 XMVECTOR Index = XMVectorReplicate(RefractionIndex);
6498 return XMVector2RefractV(Incident, Normal, Index);
6499}
6500
6501//------------------------------------------------------------------------------
6502
6503// Return the refraction of a 2D vector
6504inline XMVECTOR XMVector2RefractV
6505(
6506 FXMVECTOR Incident,
6507 FXMVECTOR Normal,
6508 FXMVECTOR RefractionIndex
6509)
6510{
6511 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
6512 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
6513
6514#if defined(_XM_NO_INTRINSICS_)
6515
6516 float IDotN = (Incident.vector4_f32[0]*Normal.vector4_f32[0])+(Incident.vector4_f32[1]*Normal.vector4_f32[1]);
6517 // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
6518 float RY = 1.0f-(IDotN*IDotN);
6519 float RX = 1.0f-(RY*RefractionIndex.vector4_f32[0]*RefractionIndex.vector4_f32[0]);
6520 RY = 1.0f-(RY*RefractionIndex.vector4_f32[1]*RefractionIndex.vector4_f32[1]);
6521 if (RX>=0.0f) {
6522 RX = (RefractionIndex.vector4_f32[0]*Incident.vector4_f32[0])-(Normal.vector4_f32[0]*((RefractionIndex.vector4_f32[0]*IDotN)+sqrtf(RX)));
6523 } else {
6524 RX = 0.0f;
6525 }
6526 if (RY>=0.0f) {
6527 RY = (RefractionIndex.vector4_f32[1]*Incident.vector4_f32[1])-(Normal.vector4_f32[1]*((RefractionIndex.vector4_f32[1]*IDotN)+sqrtf(RY)));
6528 } else {
6529 RY = 0.0f;
6530 }
6531
6532 XMVECTOR vResult;
6533 vResult.vector4_f32[0] = RX;
6534 vResult.vector4_f32[1] = RY;
6535 vResult.vector4_f32[2] = 0.0f;
6536 vResult.vector4_f32[3] = 0.0f;
6537 return vResult;
6538
6539#elif defined(_XM_ARM_NEON_INTRINSICS_)
6540 __n64 IL = vget_low_f32( Incident );
6541 __n64 NL = vget_low_f32( Normal );
6542 __n64 RIL = vget_low_f32( RefractionIndex );
6543 // Get the 2D Dot product of Incident-Normal
6544 __n64 vTemp = vmul_f32(IL, NL);
6545 __n64 IDotN = vpadd_f32( vTemp, vTemp );
6546 // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
6547 vTemp = vmls_f32( vget_low_f32( g_XMOne ), IDotN, IDotN);
6548 vTemp = vmul_f32(vTemp,RIL);
6549 vTemp = vmls_f32(vget_low_f32( g_XMOne ), vTemp, RIL );
6550 // If any terms are <=0, sqrt() will fail, punt to zero
6551 __n64 vMask = vcgt_f32(vTemp, vget_low_f32(g_XMZero) );
6552 // Sqrt(vTemp)
6553 __n64 S0 = vrsqrte_f32(vTemp);
6554 __n64 P0 = vmul_f32( vTemp, S0 );
6555 __n64 R0 = vrsqrts_f32( P0, S0 );
6556 __n64 S1 = vmul_f32( S0, R0 );
6557 __n64 P1 = vmul_f32( vTemp, S1 );
6558 __n64 R1 = vrsqrts_f32( P1, S1 );
6559 __n64 S2 = vmul_f32( S1, R1 );
6560 vTemp = vmul_f32( vTemp, S2 );
6561 // R = RefractionIndex * IDotN + sqrt(R)
6562 vTemp = vmla_f32( vTemp, RIL, IDotN );
6563 // Result = RefractionIndex * Incident - Normal * R
6564 __n64 vResult = vmul_f32(RIL,IL);
6565 vResult = vmls_f32( vResult, vTemp, NL );
6566 vResult = vand_u32(vResult,vMask);
6567 return vcombine_f32(vResult, vResult);
6568#elif defined(_XM_SSE_INTRINSICS_)
6569 // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
6570 // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
6571 // Get the 2D Dot product of Incident-Normal
6572 XMVECTOR IDotN = XMVector2Dot(Incident, Normal);
6573 // vTemp = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
6574 XMVECTOR vTemp = _mm_mul_ps(IDotN,IDotN);
6575 vTemp = _mm_sub_ps(g_XMOne,vTemp);
6576 vTemp = _mm_mul_ps(vTemp,RefractionIndex);
6577 vTemp = _mm_mul_ps(vTemp,RefractionIndex);
6578 vTemp = _mm_sub_ps(g_XMOne,vTemp);
6579 // If any terms are <=0, sqrt() will fail, punt to zero
6580 XMVECTOR vMask = _mm_cmpgt_ps(vTemp,g_XMZero);
6581 // R = RefractionIndex * IDotN + sqrt(R)
6582 vTemp = _mm_sqrt_ps(vTemp);
6583 XMVECTOR vResult = _mm_mul_ps(RefractionIndex,IDotN);
6584 vTemp = _mm_add_ps(vTemp,vResult);
6585 // Result = RefractionIndex * Incident - Normal * R
6586 vResult = _mm_mul_ps(RefractionIndex,Incident);
6587 vTemp = _mm_mul_ps(vTemp,Normal);
6588 vResult = _mm_sub_ps(vResult,vTemp);
6589 vResult = _mm_and_ps(vResult,vMask);
6590 return vResult;
6591#else // _XM_VMX128_INTRINSICS_
6592#endif // _XM_VMX128_INTRINSICS_
6593}
6594
6595//------------------------------------------------------------------------------
6596
// Returns the 2D vector perpendicular to V: (x, y) -> (-y, x), a 90-degree
// counter-clockwise rotation. The z and w components of the result are zero.
inline XMVECTOR XMVector2Orthogonal
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = -V.vector4_f32[1];
    Result.vector4_f32[1] = V.vector4_f32[0];
    Result.vector4_f32[2] = 0.f;
    Result.vector4_f32[3] = 0.f;
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Negate = { -1.f, 1.f, 0, 0 };
    const __n64 zero = vdup_n_f32(0);

    // Swap x/y with a 64-bit reverse, then negate the new x: (x,y) -> (-y,x)
    __n64 VL = vget_low_f32( V );
    __n64 Result = vmul_f32( vrev64_f32( VL ), vget_low_f32( Negate ) );
    return vcombine_f32( Result, zero );
#elif defined(_XM_SSE_INTRINSICS_)
    // Shuffle to (y,x,z,w), then flip the sign of the first component only
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,2,0,1));
    vResult = _mm_mul_ps(vResult,g_XMNegateX);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6625
6626//------------------------------------------------------------------------------
6627
6628inline XMVECTOR XMVector2AngleBetweenNormalsEst
6629(
6630 FXMVECTOR N1,
6631 FXMVECTOR N2
6632)
6633{
6634#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
6635
6636 XMVECTOR Result = XMVector2Dot(N1, N2);
6637 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
6638 Result = XMVectorACosEst(Result);
6639 return Result;
6640
6641#else // _XM_VMX128_INTRINSICS_
6642#endif // _XM_VMX128_INTRINSICS_
6643}
6644
6645//------------------------------------------------------------------------------
6646
6647inline XMVECTOR XMVector2AngleBetweenNormals
6648(
6649 FXMVECTOR N1,
6650 FXMVECTOR N2
6651)
6652{
6653#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
6654
6655 XMVECTOR Result = XMVector2Dot(N1, N2);
6656 Result = XMVectorClamp(Result, g_XMNegativeOne, g_XMOne);
6657 Result = XMVectorACos(Result);
6658 return Result;
6659
6660#else // _XM_VMX128_INTRINSICS_
6661#endif // _XM_VMX128_INTRINSICS_
6662}
6663
6664//------------------------------------------------------------------------------
6665
6666inline XMVECTOR XMVector2AngleBetweenVectors
6667(
6668 FXMVECTOR V1,
6669 FXMVECTOR V2
6670)
6671{
6672#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
6673
6674 XMVECTOR L1 = XMVector2ReciprocalLength(V1);
6675 XMVECTOR L2 = XMVector2ReciprocalLength(V2);
6676
6677 XMVECTOR Dot = XMVector2Dot(V1, V2);
6678
6679 L1 = XMVectorMultiply(L1, L2);
6680
6681 XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
6682 CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);
6683
6684 return XMVectorACos(CosAngle);
6685
6686#else // _XM_VMX128_INTRINSICS_
6687#endif // _XM_VMX128_INTRINSICS_
6688}
6689
6690//------------------------------------------------------------------------------
6691
6692inline XMVECTOR XMVector2LinePointDistance
6693(
6694 FXMVECTOR LinePoint1,
6695 FXMVECTOR LinePoint2,
6696 FXMVECTOR Point
6697)
6698{
6699 // Given a vector PointVector from LinePoint1 to Point and a vector
6700 // LineVector from LinePoint1 to LinePoint2, the scaled distance
6701 // PointProjectionScale from LinePoint1 to the perpendicular projection
6702 // of PointVector onto the line is defined as:
6703 //
6704 // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector)
6705
6706#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
6707
6708 XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1);
6709 XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1);
6710
6711 XMVECTOR LengthSq = XMVector2LengthSq(LineVector);
6712
6713 XMVECTOR PointProjectionScale = XMVector2Dot(PointVector, LineVector);
6714 PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq);
6715
6716 XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale);
6717 DistanceVector = XMVectorSubtract(PointVector, DistanceVector);
6718
6719 return XMVector2Length(DistanceVector);
6720
6721#else // _XM_VMX128_INTRINSICS_
6722#endif // _XM_VMX128_INTRINSICS_
6723}
6724
6725//------------------------------------------------------------------------------
6726
// Finds the intersection of two infinite 2D lines, each given by two points.
// Returns the intersection point, g_XMInfinity for coincident lines, or
// g_XMQNaN for parallel non-intersecting lines.
inline XMVECTOR XMVector2IntersectLine
(
    FXMVECTOR Line1Point1,
    FXMVECTOR Line1Point2,
    FXMVECTOR Line2Point1,
    GXMVECTOR Line2Point2
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    XMVECTOR V1 = XMVectorSubtract(Line1Point2, Line1Point1);
    XMVECTOR V2 = XMVectorSubtract(Line2Point2, Line2Point1);
    XMVECTOR V3 = XMVectorSubtract(Line1Point1, Line2Point1);

    // 2D cross products: C1 ~ 0 means the direction vectors are parallel
    XMVECTOR C1 = XMVector2Cross(V1, V2);
    XMVECTOR C2 = XMVector2Cross(V2, V3);

    XMVECTOR Result;
    const XMVECTOR Zero = XMVectorZero();
    if (XMVector2NearEqual(C1, Zero, g_XMEpsilon.v))
    {
        if (XMVector2NearEqual(C2, Zero, g_XMEpsilon.v))
        {
            // Coincident
            Result = g_XMInfinity.v;
        }
        else
        {
            // Parallel
            Result = g_XMQNaN.v;
        }
    }
    else
    {
        // Intersection point = Line1Point1 + V1 * (C2 / C1)
        XMVECTOR Scale = XMVectorReciprocal(C1);
        Scale = XMVectorMultiply(C2, Scale);
        Result = XMVectorMultiplyAdd(V1, Scale, Line1Point1);
    }

    return Result;

#elif defined(_XM_SSE_INTRINSICS_)
    // Branchless variant: compute both the intersection point and the
    // failure value (INF/NaN), then select per-component with masks.
    XMVECTOR V1 = _mm_sub_ps(Line1Point2, Line1Point1);
    XMVECTOR V2 = _mm_sub_ps(Line2Point2, Line2Point1);
    XMVECTOR V3 = _mm_sub_ps(Line1Point1, Line2Point1);
    // Generate the cross products
    XMVECTOR C1 = XMVector2Cross(V1, V2);
    XMVECTOR C2 = XMVector2Cross(V2, V3);
    // If C1 is not close to epsilon, use the calculated value
    // (abs(C1) computed as max(-C1, C1))
    XMVECTOR vResultMask = _mm_setzero_ps();
    vResultMask = _mm_sub_ps(vResultMask,C1);
    vResultMask = _mm_max_ps(vResultMask,C1);
    // 0xFFFFFFFF if the calculated value is to be used
    vResultMask = _mm_cmpgt_ps(vResultMask,g_XMEpsilon);
    // If C1 is close to epsilon, which fail type is it? INFINITY or NAN?
    XMVECTOR vFailMask = _mm_setzero_ps();
    vFailMask = _mm_sub_ps(vFailMask,C2);
    vFailMask = _mm_max_ps(vFailMask,C2);
    vFailMask = _mm_cmple_ps(vFailMask,g_XMEpsilon);
    XMVECTOR vFail = _mm_and_ps(vFailMask,g_XMInfinity);
    vFailMask = _mm_andnot_ps(vFailMask,g_XMQNaN);
    // vFail is NAN or INF
    vFail = _mm_or_ps(vFail,vFailMask);
    // Intersection point = Line1Point1 + V1 * (C2 / C1)
    XMVECTOR vResult = _mm_div_ps(C2,C1);
    vResult = _mm_mul_ps(vResult,V1);
    vResult = _mm_add_ps(vResult,Line1Point1);
    // Use result, or failure value
    vResult = _mm_and_ps(vResult,vResultMask);
    vResultMask = _mm_andnot_ps(vResultMask,vFail);
    vResult = _mm_or_ps(vResult,vResultMask);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6803
6804//------------------------------------------------------------------------------
6805
// Transforms a 2D vector by matrix M, treating V as (x, y, 0, 1):
// Result = x * M.r[0] + y * M.r[1] + M.r[3]. The z and w rows contribute
// via the translation row only; the returned w is generally not 1.
inline XMVECTOR XMVector2Transform
(
    FXMVECTOR V,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Y = XMVectorSplatY(V);
    XMVECTOR X = XMVectorSplatX(V);

    XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
    Result = XMVectorMultiplyAdd(X, M.r[0], Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Broadcast x and y from the low 64 bits, then two fused multiply-adds
    __n64 VL = vget_low_f32( V );
    __n128 Y = vdupq_lane_f32( VL, 1 );
    __n128 Result = vmlaq_f32( M.r[3], Y, M.r[1] );
    __n128 X = vdupq_lane_f32( VL, 0 );
    return vmlaq_f32( Result, X, M.r[0] );
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat x and y, multiply by rows 0/1, then add the translation row
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
    vResult = _mm_mul_ps(vResult,M.r[0]);
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    vTemp = _mm_mul_ps(vTemp,M.r[1]);
    vResult = _mm_add_ps(vResult,vTemp);
    vResult = _mm_add_ps(vResult,M.r[3]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6839
6840//------------------------------------------------------------------------------
6841
// Transforms a stream of 2D vectors by M (as in XMVector2Transform), writing
// full 4-component results. Input/output strides are in bytes, allowing the
// vectors to be embedded in larger structures. Returns pOutputStream.
// NOTE(review): strides smaller than the element size, or overlapping
// input/output ranges, are not checked here.
_Use_decl_annotations_
inline XMFLOAT4* XMVector2TransformStream
(
    XMFLOAT4* pOutputStream,
    size_t OutputStride,
    const XMFLOAT2* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    CXMMATRIX M
)
{
    assert(pOutputStream != NULL);
    assert(pInputStream != NULL);

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Byte pointers so the user-supplied strides can be applied directly
    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    // Hoist the matrix rows used by the transform out of the loop
    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row3 = M.r[3];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        // Result = x * row0 + y * row1 + row3
        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
6885
6886
6887//------------------------------------------------------------------------------
6888
6889inline XMVECTOR XMVector2TransformCoord
6890(
6891 FXMVECTOR V,
6892 CXMMATRIX M
6893)
6894{
6895#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
6896
6897 XMVECTOR Y = XMVectorSplatY(V);
6898 XMVECTOR X = XMVectorSplatX(V);
6899
6900 XMVECTOR Result = XMVectorMultiplyAdd(Y, M.r[1], M.r[3]);
6901 Result = XMVectorMultiplyAdd(X, M.r[0], Result);
6902
6903 XMVECTOR W = XMVectorSplatW(Result);
6904 return XMVectorDivide( Result, W );
6905
6906#else // _XM_VMX128_INTRINSICS_
6907#endif // _XM_VMX128_INTRINSICS_
6908}
6909
6910//------------------------------------------------------------------------------
6911
// Transforms a stream of 2D coordinates by M (as in XMVector2TransformCoord,
// including the divide by transformed w), storing only x,y of each result.
// Strides are in bytes. Returns pOutputStream.
_Use_decl_annotations_
inline XMFLOAT2* XMVector2TransformCoordStream
(
    XMFLOAT2* pOutputStream,
    size_t OutputStride,
    const XMFLOAT2* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    CXMMATRIX M
)
{
    assert(pOutputStream != NULL);
    assert(pInputStream != NULL);

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Byte pointers so the user-supplied strides can be applied directly
    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    // Hoist the matrix rows used by the transform out of the loop
    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row3 = M.r[3];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        // Result = x * row0 + y * row1 + row3
        XMVECTOR Result = XMVectorMultiplyAdd(Y, row1, row3);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        // Perspective divide to bring the point back to w == 1
        XMVECTOR W = XMVectorSplatW(Result);

        Result = XMVectorDivide(Result, W);

        XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
6959
6960//------------------------------------------------------------------------------
6961
// Transforms the 2D normal V (treated as (x, y, 0, 0)) by matrix M:
// Result = x * M.r[0] + y * M.r[1]. The translation row M.r[3] is ignored,
// which is what distinguishes a normal transform from a point transform.
inline XMVECTOR XMVector2TransformNormal
(
    FXMVECTOR V,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Y = XMVectorSplatY(V);
    XMVECTOR X = XMVectorSplatX(V);

    XMVECTOR Result = XMVectorMultiply(Y, M.r[1]);
    Result = XMVectorMultiplyAdd(X, M.r[0], Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Broadcast x and y from the low 64 bits; multiply y, fused multiply-add x
    __n64 VL = vget_low_f32( V );
    __n128 Y = vdupq_lane_f32( VL, 1 );
    __n128 Result = vmulq_f32( Y, M.r[1] );
    __n128 X = vdupq_lane_f32( VL, 0 );
    return vmlaq_f32( Result, X, M.r[0] );
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat x and y, multiply by rows 0/1, and sum (no translation row)
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
    vResult = _mm_mul_ps(vResult,M.r[0]);
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    vTemp = _mm_mul_ps(vTemp,M.r[1]);
    vResult = _mm_add_ps(vResult,vTemp);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
6994
6995//------------------------------------------------------------------------------
6996
// Transforms a stream of 2D normals by M (as in XMVector2TransformNormal —
// rows 0 and 1 only, no translation), storing x,y of each result.
// Strides are in bytes. Returns pOutputStream.
_Use_decl_annotations_
inline XMFLOAT2* XMVector2TransformNormalStream
(
    XMFLOAT2* pOutputStream,
    size_t OutputStride,
    const XMFLOAT2* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    CXMMATRIX M
)
{
    assert(pOutputStream != NULL);
    assert(pInputStream != NULL);

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Byte pointers so the user-supplied strides can be applied directly
    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    // Only rows 0 and 1 are needed; normals ignore translation
    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat2((const XMFLOAT2*)pInputVector);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        // Result = x * row0 + y * row1
        XMVECTOR Result = XMVectorMultiply(Y, row1);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        XMStoreFloat2((XMFLOAT2*)pOutputVector, Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
7039
7040/****************************************************************************
7041 *
7042 * 3D Vector
7043 *
7044 ****************************************************************************/
7045
7046//------------------------------------------------------------------------------
7047// Comparison operations
7048//------------------------------------------------------------------------------
7049
7050//------------------------------------------------------------------------------
7051
// Returns true if the x, y and z components of V1 and V2 are exactly equal
// (floating-point ==). The w component is ignored.
inline bool XMVector3Equal
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Zip the per-lane compare masks down so each lane contributes one byte,
    // then test the low 3 bytes (x, y, z) of the collapsed word
    __n128 vResult = vceqq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // movemask gives one bit per lane; check the low 3 bits (x, y, z)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllTrue(XMVector3EqualR(V1, V2));
#endif
}
7072
7073//------------------------------------------------------------------------------
7074
// Compares the x, y and z components of V1 and V2 for equality and returns a
// comparison record: XM_CRMASK_CR6TRUE if all three are equal,
// XM_CRMASK_CR6FALSE if all three differ, 0 for a mixed result.
inline uint32_t XMVector3EqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] == V2.vector4_f32[2]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] != V2.vector4_f32[2]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Zip the per-lane compare masks down so each lane contributes one byte;
    // r holds one byte per lane for x, y, z
    __n128 vResult = vceqq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU;

    uint32_t CR = 0;
    if ( r == 0xFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    // movemask gives one bit per lane; low 3 bits are x, y, z
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    int iTest = _mm_movemask_ps(vTemp)&7;
    uint32_t CR = 0;
    if (iTest==7)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7128
7129//------------------------------------------------------------------------------
7130
// Returns true if the x, y and z components of V1 and V2 are bitwise equal
// when treated as 32-bit integers. The w component is ignored.
inline bool XMVector3EqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Zip the per-lane compare masks down so each lane contributes one byte,
    // then test the low 3 bytes (x, y, z)
    __n128 vResult = vceqq_u32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Integer compare; movemask of the low 3 lanes must all be set
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)==7) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllTrue(XMVector3EqualIntR(V1, V2));
#endif
}
7151
7152//------------------------------------------------------------------------------
7153
// Bitwise-compares the x, y and z components of V1 and V2 as 32-bit integers
// and returns a comparison record: XM_CRMASK_CR6TRUE if all three are equal,
// XM_CRMASK_CR6FALSE if all three differ, 0 for a mixed result.
inline uint32_t XMVector3EqualIntR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if ((V1.vector4_u32[0] == V2.vector4_u32[0]) &&
        (V1.vector4_u32[1] == V2.vector4_u32[1]) &&
        (V1.vector4_u32[2] == V2.vector4_u32[2]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_u32[0] != V2.vector4_u32[0]) &&
        (V1.vector4_u32[1] != V2.vector4_u32[1]) &&
        (V1.vector4_u32[2] != V2.vector4_u32[2]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Zip the per-lane compare masks down so each lane contributes one byte;
    // r holds one byte per lane for x, y, z
    __n128 vResult = vceqq_u32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU;

    uint32_t CR = 0;
    if ( r == 0xFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    // Integer compare; movemask low 3 bits are x, y, z
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    int iTemp = _mm_movemask_ps(_mm_castsi128_ps(vTemp))&7;
    uint32_t CR = 0;
    if (iTemp==7)
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTemp)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7207
7208//------------------------------------------------------------------------------
7209
// Returns true if the x, y and z components of V1 and V2 are within the
// per-component tolerance given by Epsilon (|V1 - V2| <= Epsilon).
// The w component is ignored.
inline bool XMVector3NearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
)
{
#if defined(_XM_NO_INTRINSICS_)
    float dx, dy, dz;

    dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
    dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
    dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]);
    return (((dx <= Epsilon.vector4_f32[0]) &&
            (dy <= Epsilon.vector4_f32[1]) &&
            (dz <= Epsilon.vector4_f32[2])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // vacleq compares absolute values: |V1 - V2| <= |Epsilon| per lane,
    // then the masks are collapsed and the low 3 bytes (x, y, z) tested
    __n128 vDelta = vsubq_f32( V1, V2 );
    __n128 vResult = vacleq_f32( vDelta, Epsilon );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Get the difference
    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
    // Get the absolute value of the difference (abs(x) = max(-x, x))
    XMVECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_sub_ps(vTemp,vDelta);
    vTemp = _mm_max_ps(vTemp,vDelta);
    vTemp = _mm_cmple_ps(vTemp,Epsilon);
    // w is don't care
    return (((_mm_movemask_ps(vTemp)&7)==0x7) != 0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7245
7246//------------------------------------------------------------------------------
7247
// Returns true if any of the x, y or z components of V1 and V2 differ
// (logical negation of XMVector3Equal). The w component is ignored.
inline bool XMVector3NotEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Collapse the equality masks; not-equal if any of the low 3 bytes is clear
    __n128 vResult = vceqq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Not-equal if any of the low 3 movemask bits (x, y, z) is clear
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&7)!=7) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAnyFalse(XMVector3EqualR(V1, V2));
#endif
}
7268
7269//------------------------------------------------------------------------------
7270
// Returns true if any of the x, y or z components of V1 and V2 differ when
// treated as 32-bit integers (negation of XMVector3EqualInt). w is ignored.
inline bool XMVector3NotEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Collapse the equality masks; not-equal if any of the low 3 bytes is clear
    __n128 vResult = vceqq_u32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Not-equal if any of the low 3 movemask bits (x, y, z) is clear
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    return (((_mm_movemask_ps(_mm_castsi128_ps(vTemp))&7)!=7) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAnyFalse(XMVector3EqualIntR(V1, V2));
#endif
}
7291
7292//------------------------------------------------------------------------------
7293
// Returns true if every one of the x, y and z components of V1 is strictly
// greater than the corresponding component of V2. The w component is ignored.
inline bool XMVector3Greater
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Collapse the compare masks; all of the low 3 bytes (x, y, z) must be set
    __n128 vResult = vcgtq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // All of the low 3 movemask bits (x, y, z) must be set
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllTrue(XMVector3GreaterR(V1, V2));
#endif
}
7314
7315//------------------------------------------------------------------------------
7316
// Compares the x, y and z components of V1 and V2 and returns a comparison
// record: XM_CRMASK_CR6TRUE if every component of V1 is greater,
// XM_CRMASK_CR6FALSE if every component is <=, 0 for a mixed result.
inline uint32_t XMVector3GreaterR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if ((V1.vector4_f32[0] > V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] > V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] > V2.vector4_f32[2]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] <= V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] <= V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] <= V2.vector4_f32[2]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Collapse the compare masks; r holds one byte per lane for x, y, z
    __n128 vResult = vcgtq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU;

    uint32_t CR = 0;
    if ( r == 0xFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    // movemask low 3 bits are x, y, z
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    uint32_t CR = 0;
    int iTest = _mm_movemask_ps(vTemp)&7;
    if (iTest==7)
    {
        CR =  XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7371
7372//------------------------------------------------------------------------------
7373
// Returns true if every one of the x, y and z components of V1 is greater
// than or equal to the corresponding component of V2. w is ignored.
inline bool XMVector3GreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Collapse the compare masks; all of the low 3 bytes (x, y, z) must be set
    __n128 vResult = vcgeq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // All of the low 3 movemask bits (x, y, z) must be set
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
#else // _XM_VMX128_INTRINSICS_
    return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V1, V2));
#endif
}
7394
7395//------------------------------------------------------------------------------
7396
// Compares the x, y and z components of V1 and V2 and returns a comparison
// record: XM_CRMASK_CR6TRUE if every component of V1 is >=,
// XM_CRMASK_CR6FALSE if every component is <, 0 for a mixed result.
inline uint32_t XMVector3GreaterOrEqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    uint32_t CR = 0;
    if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] >= V2.vector4_f32[2]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] < V2.vector4_f32[2]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Collapse the compare masks; r holds one byte per lane for x, y, z
    __n128 vResult = vcgeq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU;

    uint32_t CR = 0;
    if ( r == 0xFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    // movemask low 3 bits are x, y, z
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    uint32_t CR = 0;
    int iTest = _mm_movemask_ps(vTemp)&7;
    if (iTest==7)
    {
        CR =  XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7452
7453//------------------------------------------------------------------------------
7454
// Returns true if every one of the x, y and z components of V1 is strictly
// less than the corresponding component of V2. The w component is ignored.
inline bool XMVector3Less
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Collapse the compare masks; all of the low 3 bytes (x, y, z) must be set
    __n128 vResult = vcltq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // All of the low 3 movemask bits (x, y, z) must be set
    XMVECTOR vTemp = _mm_cmplt_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
#else // _XM_VMX128_INTRINSICS_
    // Less-than is greater-than with the operands swapped
    return XMComparisonAllTrue(XMVector3GreaterR(V2, V1));
#endif
}
7475
7476//------------------------------------------------------------------------------
7477
// Returns true if every one of the x, y and z components of V1 is less than
// or equal to the corresponding component of V2. w is ignored.
inline bool XMVector3LessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Collapse the compare masks; all of the low 3 bytes (x, y, z) must be set
    __n128 vResult = vcleq_f32( V1, V2 );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // All of the low 3 movemask bits (x, y, z) must be set
    XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
    return (((_mm_movemask_ps(vTemp)&7)==7) != 0);
#else // _XM_VMX128_INTRINSICS_
    // Less-or-equal is greater-or-equal with the operands swapped
    return XMComparisonAllTrue(XMVector3GreaterOrEqualR(V2, V1));
#endif
}
7498
7499//------------------------------------------------------------------------------
7500
// Returns true if the x, y and z components of V all lie within the
// symmetric range [-Bounds, +Bounds] per component. w is ignored.
inline bool XMVector3InBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
        (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) &&
        (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test if less than or equal
    __n128 vTemp1 = vcleq_f32(V,Bounds);
    // Negate the bounds
    __n128 vTemp2 = vnegq_f32(Bounds);
    // Test if greater or equal (Reversed)
    vTemp2 = vcleq_f32(vTemp2,V);
    // Blend answers
    vTemp1 = vandq_u32(vTemp1,vTemp2);
    // in bounds? Collapse the masks and test the low 3 bytes (x, y, z)
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) == 0xFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);
    // x,y and z in bounds? (w is don't care)
    return (((_mm_movemask_ps(vTemp1)&0x7)==0x7) != 0);
#else
    return XMComparisonAllInBounds(XMVector3InBoundsR(V, Bounds));
#endif
}
7539
7540
7541//------------------------------------------------------------------------------
7542
// Returns true if any of the x, y or z components of V is NaN.
// The w component is ignored.
inline bool XMVector3IsNaN
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    return (XMISNAN(V.vector4_f32[0]) ||
            XMISNAN(V.vector4_f32[1]) ||
            XMISNAN(V.vector4_f32[2]));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test against itself. NaN is always not equal
    __n128 vTempNan = vceqq_f32( V, V );
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    // If x or y or z are NaN, the mask is zero
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0xFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test against itself. NaN is always not equal
    XMVECTOR vTempNan = _mm_cmpneq_ps(V,V);
    // If x or y or z are NaN, the mask is non-zero
    return ((_mm_movemask_ps(vTempNan)&7) != 0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7569
7570//------------------------------------------------------------------------------
7571
// Return true if any of the x, y or z components of V is +/- infinity (w is ignored).
inline bool XMVector3IsInfinite
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (XMISINF(V.vector4_f32[0]) ||
            XMISINF(V.vector4_f32[1]) ||
            XMISINF(V.vector4_f32[2]));
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Mask off the sign bit so +inf and -inf compare the same
    __n128 vTempInf = vandq_u32( V, g_XMAbsMask );
    // Compare to infinity
    vTempInf = vceqq_f32(vTempInf, g_XMInfinity );
    // Fold the per-lane masks down to one byte per lane via two zips
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    // Non-zero means at least one of x, y or z compared equal to infinity
    return ( (vget_lane_u32(vTemp.val[1], 1) & 0xFFFFFFU) != 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Mask off the sign bit so +inf and -inf compare the same
    __m128 vTemp = _mm_and_ps(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
    // If x,y or z are infinity, their sign bits are set (bit 3 = w is masked off)
    return ((_mm_movemask_ps(vTemp)&7) != 0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7600
7601//------------------------------------------------------------------------------
7602// Computation operations
7603//------------------------------------------------------------------------------
7604
7605//------------------------------------------------------------------------------
7606
// Compute the 3D dot product of V1 and V2 (w components ignored); the scalar
// result is replicated into all four components of the returned vector.
inline XMVECTOR XMVector3Dot
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    float fValue = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2];
    XMVECTOR vResult = {
        fValue,
        fValue,
        fValue,
        fValue
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Component-wise multiply, then horizontal add of x, y and z
    __n128 vTemp = vmulq_f32( V1, V2 );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );     // { x+y, x+y }
    v2 = vdup_lane_f32( v2, 0 );  // { z, z }
    v1 = vadd_f32( v1, v2 );      // { x+y+z, x+y+z }
    return vcombine_f32( v1, v1 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the component-wise products
    XMVECTOR vDot = _mm_mul_ps(V1,V2);
    // vTemp = { y, z, y, z }
    XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
    // Low lane = x+y
    vDot = _mm_add_ss(vDot,vTemp);
    // Move z into the low lane
    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
    // Low lane = (x+y)+z
    vDot = _mm_add_ss(vDot,vTemp);
    // Splat the result across all four lanes
    return XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7647
7648//------------------------------------------------------------------------------
7649
7650inline XMVECTOR XMVector3Cross
7651(
7652 FXMVECTOR V1,
7653 FXMVECTOR V2
7654)
7655{
7656 // [ V1.y*V2.z - V1.z*V2.y, V1.z*V2.x - V1.x*V2.z, V1.x*V2.y - V1.y*V2.x ]
7657
7658#if defined(_XM_NO_INTRINSICS_)
7659 XMVECTOR vResult = {
7660 (V1.vector4_f32[1] * V2.vector4_f32[2]) - (V1.vector4_f32[2] * V2.vector4_f32[1]),
7661 (V1.vector4_f32[2] * V2.vector4_f32[0]) - (V1.vector4_f32[0] * V2.vector4_f32[2]),
7662 (V1.vector4_f32[0] * V2.vector4_f32[1]) - (V1.vector4_f32[1] * V2.vector4_f32[0]),
7663 0.0f
7664 };
7665 return vResult;
7666#elif defined(_XM_ARM_NEON_INTRINSICS_)
7667 __n64 v1xy = vget_low_f32(V1);
7668 __n64 v2xy = vget_low_f32(V2);
7669
7670 __n64 v1yx = vrev64_f32( v1xy );
7671 __n64 v2yx = vrev64_f32( v2xy );
7672
7673 __n64 v1zz = vdup_lane_f32( vget_high_f32(V1), 0 );
7674 __n64 v2zz = vdup_lane_f32( vget_high_f32(V2), 0 );
7675
7676 __n128 vResult = vmulq_f32( vcombine_f32(v1yx,v1xy), vcombine_f32(v2zz,v2yx) );
7677 vResult = vmlsq_f32( vResult, vcombine_f32(v1zz,v1yx), vcombine_f32(v2yx,v2xy) );
7678 return veorq_u32( vResult, g_XMFlipY );
7679#elif defined(_XM_SSE_INTRINSICS_)
7680 // y1,z1,x1,w1
7681 XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(3,0,2,1));
7682 // z2,x2,y2,w2
7683 XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(3,1,0,2));
7684 // Perform the left operation
7685 XMVECTOR vResult = _mm_mul_ps(vTemp1,vTemp2);
7686 // z1,x1,y1,w1
7687 vTemp1 = XM_PERMUTE_PS(vTemp1,_MM_SHUFFLE(3,0,2,1));
7688 // y2,z2,x2,w2
7689 vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(3,1,0,2));
7690 // Perform the right operation
7691 vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
7692 // Subract the right from left, and return answer
7693 vResult = _mm_sub_ps(vResult,vTemp1);
7694 // Set w to zero
7695 return _mm_and_ps(vResult,g_XMMask3);
7696#else // _XM_VMX128_INTRINSICS_
7697#endif // _XM_VMX128_INTRINSICS_
7698}
7699
7700//------------------------------------------------------------------------------
7701
// Return the squared 3D length of V, replicated into all four components.
// Implemented as dot(V, V); the w component is ignored by the dot product.
inline XMVECTOR XMVector3LengthSq
(
    FXMVECTOR V
)
{
    return XMVector3Dot(V, V);
}
7709
7710//------------------------------------------------------------------------------
7711
// Return an estimate of 1 / length(V) (3D length; w ignored), replicated into
// all four components. Uses the hardware reciprocal-sqrt estimate, so precision
// is lower than XMVector3ReciprocalLength.
inline XMVECTOR XMVector3ReciprocalLengthEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;

    Result = XMVector3LengthSq(V);
    Result = XMVectorReciprocalSqrtEst(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot3
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vdup_lane_f32( v2, 0 );
    v1 = vadd_f32( v1, v2 );
    // Reciprocal sqrt (estimate only, no Newton-Raphson refinement)
    v2 = vrsqrte_f32( v1 );
    return vcombine_f32(v2, v2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y and z
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp = { z, y, z, y }
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2));
    // Low lane = x+z
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    // y,y,y,y
    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
    // Low lane = x+z+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    // Splat the length squared
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    // Hardware reciprocal-sqrt estimate
    vLengthSq = _mm_rsqrt_ps(vLengthSq);
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7756
7757//------------------------------------------------------------------------------
7758
// Return 1 / length(V) (3D length; w ignored), replicated into all four
// components, at full precision.
inline XMVECTOR XMVector3ReciprocalLength
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;

    Result = XMVector3LengthSq(V);
    Result = XMVectorReciprocalSqrt(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot3
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vdup_lane_f32( v2, 0 );
    v1 = vadd_f32( v1, v2 );
    // Reciprocal sqrt: estimate refined by two Newton-Raphson iterations
    __n64 S0 = vrsqrte_f32(v1);
    __n64 P0 = vmul_f32( v1, S0 );
    __n64 R0 = vrsqrts_f32( P0, S0 );
    __n64 S1 = vmul_f32( S0, R0 );
    __n64 P1 = vmul_f32( v1, S1 );
    __n64 R1 = vrsqrts_f32( P1, S1 );
    __n64 Result = vmul_f32( S1, R1 );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product
    XMVECTOR vDot = _mm_mul_ps(V,V);
    // vTemp = { y, z, y, z }
    XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
    // Low lane = x+y
    vDot = _mm_add_ss(vDot,vTemp);
    // Move z into the low lane
    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
    // Low lane = (x+y)+z
    vDot = _mm_add_ss(vDot,vTemp);
    // Splat the length squared
    vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
    // Get the length
    vDot = _mm_sqrt_ps(vDot);
    // Get the reciprocal
    vDot = _mm_div_ps(g_XMOne,vDot);
    return vDot;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7811
7812//------------------------------------------------------------------------------
7813
// Return an estimate of the 3D length of V (w ignored), replicated into all
// four components.
inline XMVECTOR XMVector3LengthEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;

    Result = XMVector3LengthSq(V);
    Result = XMVectorSqrtEst(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot3
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vdup_lane_f32( v2, 0 );
    v1 = vadd_f32( v1, v2 );
    const __n64 zero = vdup_n_u32(0);
    __n64 VEqualsZero = vceq_f32( v1, zero );
    // Sqrt (estimate): x * rsqrte(x), with a select to force 0 for zero-length
    // input (rsqrte(0) would be infinity, giving 0 * inf = NaN)
    __n64 Result = vrsqrte_f32( v1 );
    Result = vmul_f32( v1, Result );
    Result = vbsl_f32( VEqualsZero, zero, Result );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y and z
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp = { z, y, z, y }
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2));
    // Low lane = x+z
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    // y,y,y,y
    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
    // Low lane = x+z+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    // Splat the length squared
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    // Get the length (full-precision sqrt on this path)
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7862
7863//------------------------------------------------------------------------------
7864
// Return the 3D length of V (w ignored), replicated into all four components.
inline XMVECTOR XMVector3Length
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;

    Result = XMVector3LengthSq(V);
    Result = XMVectorSqrt(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot3
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vdup_lane_f32( v2, 0 );
    v1 = vadd_f32( v1, v2 );
    const __n64 zero = vdup_n_u32(0);
    __n64 VEqualsZero = vceq_f32( v1, zero );
    // Sqrt: x * rsqrt(x), with rsqrt refined by two Newton-Raphson iterations
    __n64 S0 = vrsqrte_f32( v1 );
    __n64 P0 = vmul_f32( v1, S0 );
    __n64 R0 = vrsqrts_f32( P0, S0 );
    __n64 S1 = vmul_f32( S0, R0 );
    __n64 P1 = vmul_f32( v1, S1 );
    __n64 R1 = vrsqrts_f32( P1, S1 );
    __n64 Result = vmul_f32( S1, R1 );
    Result = vmul_f32( v1, Result );
    // Force 0 for zero-length input (avoids 0 * inf = NaN)
    Result = vbsl_f32( VEqualsZero, zero, Result );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y and z
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp = { z, y, z, y }
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,2,1,2));
    // Low lane = x+z
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    // y,y,y,y
    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
    // Low lane = x+z+y
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    // Splat the length squared
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    // Get the length
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7919
7920//------------------------------------------------------------------------------
7921// XMVector3NormalizeEst uses a reciprocal estimate and
7922// returns QNaN on zero and infinite vectors.
7923
// Return V scaled by an estimate of 1 / length(V) (3D length; w is scaled too).
// Uses the hardware reciprocal-sqrt estimate and performs no special handling:
// returns QNaN for zero-length and infinite-length inputs.
inline XMVECTOR XMVector3NormalizeEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVector3ReciprocalLength(V);
    Result = XMVectorMultiply(V, Result);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot3
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vdup_lane_f32( v2, 0 );
    v1 = vadd_f32( v1, v2 );
    // Reciprocal sqrt (estimate)
    v2 = vrsqrte_f32( v1 );
    // Normalize
    return vmulq_f32( V, vcombine_f32(v2,v2) );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product
    XMVECTOR vDot = _mm_mul_ps(V,V);
    // vTemp = { y, z, y, z }
    XMVECTOR vTemp = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(2,1,2,1));
    // Low lane = x+y
    vDot = _mm_add_ss(vDot,vTemp);
    // Move z into the low lane
    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
    // Low lane = (x+y)+z
    vDot = _mm_add_ss(vDot,vTemp);
    // Splat the length squared
    vDot = XM_PERMUTE_PS(vDot,_MM_SHUFFLE(0,0,0,0));
    // Reciprocal-sqrt estimate of the length squared
    vDot = _mm_rsqrt_ps(vDot);
    // Perform the normalization
    vDot = _mm_mul_ps(vDot,V);
    return vDot;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
7969
7970//------------------------------------------------------------------------------
7971
// Return V normalized to unit 3D length (w is scaled by the same factor).
// The SIMD paths return the zero vector for zero-length input and QNaN for
// infinite-length input.
inline XMVECTOR XMVector3Normalize
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    float fLength;
    XMVECTOR vResult;

    vResult = XMVector3Length( V );
    fLength = vResult.vector4_f32[0];

    // Prevent divide by zero
    if (fLength > 0) {
        fLength = 1.0f/fLength;
    }

    // NOTE(review): unlike the SSE/NEON paths, this path does not produce QNaN
    // for infinite-length inputs — confirm the divergence is acceptable.
    vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
    vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
    vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
    vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot3
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vdup_lane_f32( v2, 0 );
    v1 = vadd_f32( v1, v2 );
    // Masks for the two degenerate cases
    __n64 VEqualsZero = vceq_f32( v1, vdup_n_u32(0) );
    __n64 VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) );
    // Reciprocal sqrt (2 iterations of Newton-Raphson)
    __n64 S0 = vrsqrte_f32( v1 );
    __n64 P0 = vmul_f32( v1, S0 );
    __n64 R0 = vrsqrts_f32( P0, S0 );
    __n64 S1 = vmul_f32( S0, R0 );
    __n64 P1 = vmul_f32( v1, S1 );
    __n64 R1 = vrsqrts_f32( P1, S1 );
    v2 = vmul_f32( S1, R1 );
    // Normalize
    __n128 vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
    // Zero-length input -> zero vector; infinite-length input -> QNaN
    vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
    return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y and z only
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,1,2,1));
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vTemp = XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(1,1,1,1));
    vLengthSq = _mm_add_ss(vLengthSq,vTemp);
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(0,0,0,0));
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V,vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult,vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
    vResult = _mm_or_ps(vTemp1,vTemp2);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
8046
8047//------------------------------------------------------------------------------
8048
8049inline XMVECTOR XMVector3ClampLength
8050(
8051 FXMVECTOR V,
8052 float LengthMin,
8053 float LengthMax
8054)
8055{
8056 XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
8057 XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
8058
8059 return XMVector3ClampLengthV(V, ClampMin, ClampMax);
8060}
8061
8062//------------------------------------------------------------------------------
8063
8064inline XMVECTOR XMVector3ClampLengthV
8065(
8066 FXMVECTOR V,
8067 FXMVECTOR LengthMin,
8068 FXMVECTOR LengthMax
8069)
8070{
8071 assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)));
8072 assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)));
8073 assert(XMVector3GreaterOrEqual(LengthMin, XMVectorZero()));
8074 assert(XMVector3GreaterOrEqual(LengthMax, XMVectorZero()));
8075 assert(XMVector3GreaterOrEqual(LengthMax, LengthMin));
8076
8077 XMVECTOR LengthSq = XMVector3LengthSq(V);
8078
8079 const XMVECTOR Zero = XMVectorZero();
8080
8081 XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);
8082
8083 XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
8084 XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);
8085
8086 XMVECTOR Normal = XMVectorMultiply(V, RcpLength);
8087
8088 XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);
8089
8090 XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
8091 Length = XMVectorSelect(LengthSq, Length, Select);
8092 Normal = XMVectorSelect(LengthSq, Normal, Select);
8093
8094 XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
8095 XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);
8096
8097 XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
8098 ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);
8099
8100 XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);
8101
8102 // Preserve the original vector (with no precision loss) if the length falls within the given range
8103 XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
8104 Result = XMVectorSelect(Result, V, Control);
8105
8106 return Result;
8107}
8108
8109//------------------------------------------------------------------------------
8110
8111inline XMVECTOR XMVector3Reflect
8112(
8113 FXMVECTOR Incident,
8114 FXMVECTOR Normal
8115)
8116{
8117 // Result = Incident - (2 * dot(Incident, Normal)) * Normal
8118
8119 XMVECTOR Result = XMVector3Dot(Incident, Normal);
8120 Result = XMVectorAdd(Result, Result);
8121 Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
8122
8123 return Result;
8124}
8125
8126//------------------------------------------------------------------------------
8127
8128inline XMVECTOR XMVector3Refract
8129(
8130 FXMVECTOR Incident,
8131 FXMVECTOR Normal,
8132 float RefractionIndex
8133)
8134{
8135 XMVECTOR Index = XMVectorReplicate(RefractionIndex);
8136 return XMVector3RefractV(Incident, Normal, Index);
8137}
8138
8139//------------------------------------------------------------------------------
8140
// Refract Incident through the surface with the given Normal, using a
// per-component (replicated) index of refraction. Returns the zero vector on
// total internal reflection.
inline XMVECTOR XMVector3RefractV
(
    FXMVECTOR Incident,
    FXMVECTOR Normal,
    FXMVECTOR RefractionIndex
)
{
    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))

#if defined(_XM_NO_INTRINSICS_)

    const XMVECTOR Zero = XMVectorZero();

    XMVECTOR IDotN = XMVector3Dot(Incident, Normal);

    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    XMVECTOR R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v);
    R = XMVectorMultiply(R, RefractionIndex);
    R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v);

    if (XMVector4LessOrEqual(R, Zero))
    {
        // Total internal reflection
        return Zero;
    }
    else
    {
        // R = RefractionIndex * IDotN + sqrt(R)
        R = XMVectorSqrt(R);
        R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R);

        // Result = RefractionIndex * Incident - Normal * R
        XMVECTOR Result = XMVectorMultiply(RefractionIndex, Incident);
        Result = XMVectorNegativeMultiplySubtract(Normal, R, Result);

        return Result;
    }

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR IDotN = XMVector3Dot(Incident,Normal);

    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    __n128 R = vmlsq_f32( g_XMOne, IDotN, IDotN);
    R = vmulq_f32(R, RefractionIndex);
    R = vmlsq_f32(g_XMOne, R, RefractionIndex );

    // All four lanes of R <= 0 indicates total internal reflection
    __n128 vResult = vcleq_f32(R,g_XMZero);
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU )
    {
        // Total internal reflection
        vResult = g_XMZero;
    }
    else
    {
        // Sqrt(R): R * rsqrt(R), with rsqrt refined by two Newton-Raphson steps
        __n128 S0 = vrsqrteq_f32(R);
        __n128 P0 = vmulq_f32( R, S0 );
        __n128 R0 = vrsqrtsq_f32( P0, S0 );
        __n128 S1 = vmulq_f32( S0, R0 );
        __n128 P1 = vmulq_f32( R, S1 );
        __n128 R1 = vrsqrtsq_f32( P1, S1 );
        __n128 S2 = vmulq_f32( S1, R1 );
        R = vmulq_f32( R, S2 );
        // R = RefractionIndex * IDotN + sqrt(R)
        R = vmlaq_f32( R, RefractionIndex, IDotN );
        // Result = RefractionIndex * Incident - Normal * R
        vResult = vmulq_f32(RefractionIndex, Incident);
        vResult = vmlsq_f32( vResult, R, Normal );
    }
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))
    XMVECTOR IDotN = XMVector3Dot(Incident, Normal);
    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    XMVECTOR R = _mm_mul_ps(IDotN, IDotN);
    R = _mm_sub_ps(g_XMOne,R);
    R = _mm_mul_ps(R, RefractionIndex);
    R = _mm_mul_ps(R, RefractionIndex);
    R = _mm_sub_ps(g_XMOne,R);

    // All four lanes of R <= 0 indicates total internal reflection
    XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero);
    if (_mm_movemask_ps(vResult)==0x0f)
    {
        // Total internal reflection
        vResult = g_XMZero;
    }
    else
    {
        // R = RefractionIndex * IDotN + sqrt(R)
        R = _mm_sqrt_ps(R);
        vResult = _mm_mul_ps(RefractionIndex,IDotN);
        R = _mm_add_ps(R,vResult);
        // Result = RefractionIndex * Incident - Normal * R
        vResult = _mm_mul_ps(RefractionIndex, Incident);
        R = _mm_mul_ps(R,Normal);
        vResult = _mm_sub_ps(vResult,R);
    }
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
8246
8247//------------------------------------------------------------------------------
8248
8249inline XMVECTOR XMVector3Orthogonal
8250(
8251 FXMVECTOR V
8252)
8253{
8254#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8255
8256 XMVECTOR Zero = XMVectorZero();
8257 XMVECTOR Z = XMVectorSplatZ(V);
8258 XMVECTOR YZYY = XMVectorSwizzle<XM_SWIZZLE_Y, XM_SWIZZLE_Z, XM_SWIZZLE_Y, XM_SWIZZLE_Y>(V);
8259
8260 XMVECTOR NegativeV = XMVectorSubtract(Zero, V);
8261
8262 XMVECTOR ZIsNegative = XMVectorLess(Z, Zero);
8263 XMVECTOR YZYYIsNegative = XMVectorLess(YZYY, Zero);
8264
8265 XMVECTOR S = XMVectorAdd(YZYY, Z);
8266 XMVECTOR D = XMVectorSubtract(YZYY, Z);
8267
8268 XMVECTOR Select = XMVectorEqualInt(ZIsNegative, YZYYIsNegative);
8269
8270 XMVECTOR R0 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(NegativeV, S);
8271 XMVECTOR R1 = XMVectorPermute<XM_PERMUTE_1X, XM_PERMUTE_0X, XM_PERMUTE_0X, XM_PERMUTE_0X>(V, D);
8272
8273 return XMVectorSelect(R1, R0, Select);
8274
8275#else // _XM_VMX128_INTRINSICS_
8276#endif // _XM_VMX128_INTRINSICS_
8277}
8278
8279//------------------------------------------------------------------------------
8280
8281inline XMVECTOR XMVector3AngleBetweenNormalsEst
8282(
8283 FXMVECTOR N1,
8284 FXMVECTOR N2
8285)
8286{
8287#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8288
8289 XMVECTOR Result = XMVector3Dot(N1, N2);
8290 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
8291 Result = XMVectorACosEst(Result);
8292 return Result;
8293
8294#else // _XM_VMX128_INTRINSICS_
8295#endif // _XM_VMX128_INTRINSICS_
8296}
8297
8298//------------------------------------------------------------------------------
8299
8300inline XMVECTOR XMVector3AngleBetweenNormals
8301(
8302 FXMVECTOR N1,
8303 FXMVECTOR N2
8304)
8305{
8306#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8307
8308 XMVECTOR Result = XMVector3Dot(N1, N2);
8309 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
8310 Result = XMVectorACos(Result);
8311 return Result;
8312
8313#else // _XM_VMX128_INTRINSICS_
8314#endif // _XM_VMX128_INTRINSICS_
8315}
8316
8317//------------------------------------------------------------------------------
8318
8319inline XMVECTOR XMVector3AngleBetweenVectors
8320(
8321 FXMVECTOR V1,
8322 FXMVECTOR V2
8323)
8324{
8325#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8326
8327 XMVECTOR L1 = XMVector3ReciprocalLength(V1);
8328 XMVECTOR L2 = XMVector3ReciprocalLength(V2);
8329
8330 XMVECTOR Dot = XMVector3Dot(V1, V2);
8331
8332 L1 = XMVectorMultiply(L1, L2);
8333
8334 XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
8335 CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);
8336
8337 return XMVectorACos(CosAngle);
8338
8339#else // _XM_VMX128_INTRINSICS_
8340#endif // _XM_VMX128_INTRINSICS_
8341}
8342
8343//------------------------------------------------------------------------------
8344
8345inline XMVECTOR XMVector3LinePointDistance
8346(
8347 FXMVECTOR LinePoint1,
8348 FXMVECTOR LinePoint2,
8349 FXMVECTOR Point
8350)
8351{
8352 // Given a vector PointVector from LinePoint1 to Point and a vector
8353 // LineVector from LinePoint1 to LinePoint2, the scaled distance
8354 // PointProjectionScale from LinePoint1 to the perpendicular projection
8355 // of PointVector onto the line is defined as:
8356 //
8357 // PointProjectionScale = dot(PointVector, LineVector) / LengthSq(LineVector)
8358
8359#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8360
8361 XMVECTOR PointVector = XMVectorSubtract(Point, LinePoint1);
8362 XMVECTOR LineVector = XMVectorSubtract(LinePoint2, LinePoint1);
8363
8364 XMVECTOR LengthSq = XMVector3LengthSq(LineVector);
8365
8366 XMVECTOR PointProjectionScale = XMVector3Dot(PointVector, LineVector);
8367 PointProjectionScale = XMVectorDivide(PointProjectionScale, LengthSq);
8368
8369 XMVECTOR DistanceVector = XMVectorMultiply(LineVector, PointProjectionScale);
8370 DistanceVector = XMVectorSubtract(PointVector, DistanceVector);
8371
8372 return XMVector3Length(DistanceVector);
8373
8374#else // _XM_VMX128_INTRINSICS_
8375#endif // _XM_VMX128_INTRINSICS_
8376}
8377
8378//------------------------------------------------------------------------------
8379
8380_Use_decl_annotations_
8381inline void XMVector3ComponentsFromNormal
8382(
8383 XMVECTOR* pParallel,
8384 XMVECTOR* pPerpendicular,
8385 FXMVECTOR V,
8386 FXMVECTOR Normal
8387)
8388{
8389 assert(pParallel != NULL);
8390 assert(pPerpendicular != NULL);
8391
8392 XMVECTOR Scale = XMVector3Dot(V, Normal);
8393
8394 XMVECTOR Parallel = XMVectorMultiply(Normal, Scale);
8395
8396 *pParallel = Parallel;
8397 *pPerpendicular = XMVectorSubtract(V, Parallel);
8398}
8399
8400//------------------------------------------------------------------------------
8401// Transform a vector using a rotation expressed as a unit quaternion
8402
8403inline XMVECTOR XMVector3Rotate
8404(
8405 FXMVECTOR V,
8406 FXMVECTOR RotationQuaternion
8407)
8408{
8409#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8410
8411 XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
8412 XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion);
8413 XMVECTOR Result = XMQuaternionMultiply(Q, A);
8414 return XMQuaternionMultiply(Result, RotationQuaternion);
8415
8416#else // _XM_VMX128_INTRINSICS_
8417#endif // _XM_VMX128_INTRINSICS_
8418}
8419
8420//------------------------------------------------------------------------------
8421// Transform a vector using the inverse of a rotation expressed as a unit quaternion
8422
8423inline XMVECTOR XMVector3InverseRotate
8424(
8425 FXMVECTOR V,
8426 FXMVECTOR RotationQuaternion
8427)
8428{
8429#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8430
8431 XMVECTOR A = XMVectorSelect(g_XMSelect1110.v, V, g_XMSelect1110.v);
8432 XMVECTOR Result = XMQuaternionMultiply(RotationQuaternion, A);
8433 XMVECTOR Q = XMQuaternionConjugate(RotationQuaternion);
8434 return XMQuaternionMultiply(Result, Q);
8435
8436#else // _XM_VMX128_INTRINSICS_
8437#endif // _XM_VMX128_INTRINSICS_
8438}
8439
8440//------------------------------------------------------------------------------
8441
// Transform V by the matrix M, treating V as a point (w assumed to be 1):
// result = x*M.r[0] + y*M.r[1] + z*M.r[2] + M.r[3].
inline XMVECTOR XMVector3Transform
(
    FXMVECTOR V,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Z = XMVectorSplatZ(V);
    XMVECTOR Y = XMVectorSplatY(V);
    XMVECTOR X = XMVectorSplatX(V);

    // Accumulate row3 + z*row2 + y*row1 + x*row0
    XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
    Result = XMVectorMultiplyAdd(X, M.r[0], Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n64 VL = vget_low_f32( V );
    XMVECTOR vResult = vdupq_lane_f32( VL, 0 ); // X
    XMVECTOR vTemp = vdupq_lane_f32( VL, 1 ); // Y
    vResult = vmlaq_f32( M.r[3], vResult, M.r[0] );
    vResult = vmlaq_f32( vResult, vTemp, M.r[1] );
    vTemp = vdupq_lane_f32( vget_high_f32( V ), 0 ); // Z
    return vmlaq_f32( vResult, vTemp, M.r[2] );
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat each input component, scale the matching matrix row, accumulate
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
    vResult = _mm_mul_ps(vResult,M.r[0]);
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    vTemp = _mm_mul_ps(vTemp,M.r[1]);
    vResult = _mm_add_ps(vResult,vTemp);
    vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    vTemp = _mm_mul_ps(vTemp,M.r[2]);
    vResult = _mm_add_ps(vResult,vTemp);
    // Add the translation row (w treated as 1)
    vResult = _mm_add_ps(vResult,M.r[3]);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
8482
8483//------------------------------------------------------------------------------
8484
// Transform a stream of XMFLOAT3 points (w treated as 1) by the matrix M,
// writing XMFLOAT4 results. Strides are in bytes and may exceed the element
// sizes to support interleaved data. Returns pOutputStream.
_Use_decl_annotations_
inline XMFLOAT4* XMVector3TransformStream
(
    XMFLOAT4* pOutputStream,
    size_t OutputStride,
    const XMFLOAT3* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    CXMMATRIX M
)
{
    assert(pOutputStream != NULL);
    assert(pInputStream != NULL);

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Walk byte pointers so arbitrary strides are supported
    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    // Hoist the matrix rows out of the loop
    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
        XMVECTOR Z = XMVectorSplatZ(V);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        // result = x*row0 + y*row1 + z*row2 + row3
        XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3);
        Result = XMVectorMultiplyAdd(Y, row1, Result);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
// NOTE(review): this branch is empty, leaving the function without a return
// statement if it is ever taken — presumably it is never defined alongside the
// intrinsic configurations above; confirm against the build configurations.
#endif // _XM_VMX128_INTRINSICS_
}
8531
8532
8533//------------------------------------------------------------------------------
8534
8535inline XMVECTOR XMVector3TransformCoord
8536(
8537 FXMVECTOR V,
8538 CXMMATRIX M
8539)
8540{
8541#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8542
8543 XMVECTOR Z = XMVectorSplatZ(V);
8544 XMVECTOR Y = XMVectorSplatY(V);
8545 XMVECTOR X = XMVectorSplatX(V);
8546
8547 XMVECTOR Result = XMVectorMultiplyAdd(Z, M.r[2], M.r[3]);
8548 Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
8549 Result = XMVectorMultiplyAdd(X, M.r[0], Result);
8550
8551 XMVECTOR W = XMVectorSplatW(Result);
8552 return XMVectorDivide( Result, W );
8553
8554#else // _XM_VMX128_INTRINSICS_
8555#endif // _XM_VMX128_INTRINSICS_
8556}
8557
8558//------------------------------------------------------------------------------
8559
8560_Use_decl_annotations_
8561inline XMFLOAT3* XMVector3TransformCoordStream
8562(
8563 XMFLOAT3* pOutputStream,
8564 size_t OutputStride,
8565 const XMFLOAT3* pInputStream,
8566 size_t InputStride,
8567 size_t VectorCount,
8568 CXMMATRIX M
8569)
8570{
8571 assert(pOutputStream != NULL);
8572 assert(pInputStream != NULL);
8573
8574#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8575
8576 const uint8_t* pInputVector = (const uint8_t*)pInputStream;
8577 uint8_t* pOutputVector = (uint8_t*)pOutputStream;
8578
8579 const XMVECTOR row0 = M.r[0];
8580 const XMVECTOR row1 = M.r[1];
8581 const XMVECTOR row2 = M.r[2];
8582 const XMVECTOR row3 = M.r[3];
8583
8584 for (size_t i = 0; i < VectorCount; i++)
8585 {
8586 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
8587 XMVECTOR Z = XMVectorSplatZ(V);
8588 XMVECTOR Y = XMVectorSplatY(V);
8589 XMVECTOR X = XMVectorSplatX(V);
8590
8591 XMVECTOR Result = XMVectorMultiplyAdd(Z, row2, row3);
8592 Result = XMVectorMultiplyAdd(Y, row1, Result);
8593 Result = XMVectorMultiplyAdd(X, row0, Result);
8594
8595 XMVECTOR W = XMVectorSplatW(Result);
8596
8597 Result = XMVectorDivide(Result, W);
8598
8599 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
8600
8601 pInputVector += InputStride;
8602 pOutputVector += OutputStride;
8603 }
8604
8605 return pOutputStream;
8606
8607#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
8608#endif // _XM_VMX128_INTRINSICS_
8609}
8610
8611//------------------------------------------------------------------------------
8612
// Transform the 3D direction vector V by the upper 3x3 of M (w treated as 0,
// so the translation row M.r[3] is ignored). Returns x*r0 + y*r1 + z*r2.
inline XMVECTOR XMVector3TransformNormal
(
    FXMVECTOR V,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Z = XMVectorSplatZ(V);
    XMVECTOR Y = XMVectorSplatY(V);
    XMVECTOR X = XMVectorSplatX(V);

    // Start with z*row2 (no +row3 — normals don't translate), then fold in y and x.
    XMVECTOR Result = XMVectorMultiply(Z, M.r[2]);
    Result = XMVectorMultiplyAdd(Y, M.r[1], Result);
    Result = XMVectorMultiplyAdd(X, M.r[0], Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Splat x and y from the low half of V, z from the high half.
    __n64 VL = vget_low_f32( V );
    XMVECTOR vResult = vdupq_lane_f32( VL, 0 ); // X
    XMVECTOR vTemp = vdupq_lane_f32( VL, 1 ); // Y
    vResult = vmulq_f32( vResult, M.r[0] );
    vResult = vmlaq_f32( vResult, vTemp, M.r[1] );
    vTemp = vdupq_lane_f32( vget_high_f32( V ), 0 ); // Z
    return vmlaq_f32( vResult, vTemp, M.r[2] );
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat each component, multiply by the matching row, and sum.
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
    vResult = _mm_mul_ps(vResult,M.r[0]);
    XMVECTOR vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    vTemp = _mm_mul_ps(vTemp,M.r[1]);
    vResult = _mm_add_ps(vResult,vTemp);
    vTemp = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    vTemp = _mm_mul_ps(vTemp,M.r[2]);
    vResult = _mm_add_ps(vResult,vTemp);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
8652
8653//------------------------------------------------------------------------------
8654
8655_Use_decl_annotations_
8656inline XMFLOAT3* XMVector3TransformNormalStream
8657(
8658 XMFLOAT3* pOutputStream,
8659 size_t OutputStride,
8660 const XMFLOAT3* pInputStream,
8661 size_t InputStride,
8662 size_t VectorCount,
8663 CXMMATRIX M
8664)
8665{
8666 assert(pOutputStream != NULL);
8667 assert(pInputStream != NULL);
8668
8669#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8670
8671 const uint8_t* pInputVector = (const uint8_t*)pInputStream;
8672 uint8_t* pOutputVector = (uint8_t*)pOutputStream;
8673
8674 const XMVECTOR row0 = M.r[0];
8675 const XMVECTOR row1 = M.r[1];
8676 const XMVECTOR row2 = M.r[2];
8677
8678 for (size_t i = 0; i < VectorCount; i++)
8679 {
8680 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
8681 XMVECTOR Z = XMVectorSplatZ(V);
8682 XMVECTOR Y = XMVectorSplatY(V);
8683 XMVECTOR X = XMVectorSplatX(V);
8684
8685 XMVECTOR Result = XMVectorMultiply(Z, row2);
8686 Result = XMVectorMultiplyAdd(Y, row1, Result);
8687 Result = XMVectorMultiplyAdd(X, row0, Result);
8688
8689 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
8690
8691 pInputVector += InputStride;
8692 pOutputVector += OutputStride;
8693 }
8694
8695 return pOutputStream;
8696
8697#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
8698#endif // _XM_VMX128_INTRINSICS_
8699}
8700
8701//------------------------------------------------------------------------------
8702
8703inline XMVECTOR XMVector3Project
8704(
8705 FXMVECTOR V,
8706 float ViewportX,
8707 float ViewportY,
8708 float ViewportWidth,
8709 float ViewportHeight,
8710 float ViewportMinZ,
8711 float ViewportMaxZ,
8712 CXMMATRIX Projection,
8713 CXMMATRIX View,
8714 CXMMATRIX World
8715)
8716{
8717#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8718
8719 const float HalfViewportWidth = ViewportWidth * 0.5f;
8720 const float HalfViewportHeight = ViewportHeight * 0.5f;
8721
8722 XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 0.0f);
8723 XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
8724
8725 XMMATRIX Transform = XMMatrixMultiply(World, View);
8726 Transform = XMMatrixMultiply(Transform, Projection);
8727
8728 XMVECTOR Result = XMVector3TransformCoord(V, Transform);
8729
8730 Result = XMVectorMultiplyAdd(Result, Scale, Offset);
8731
8732 return Result;
8733
8734#else // _XM_VMX128_INTRINSICS_
8735#endif // _XM_VMX128_INTRINSICS_
8736}
8737
8738//------------------------------------------------------------------------------
8739
8740_Use_decl_annotations_
8741inline XMFLOAT3* XMVector3ProjectStream
8742(
8743 XMFLOAT3* pOutputStream,
8744 size_t OutputStride,
8745 const XMFLOAT3* pInputStream,
8746 size_t InputStride,
8747 size_t VectorCount,
8748 float ViewportX,
8749 float ViewportY,
8750 float ViewportWidth,
8751 float ViewportHeight,
8752 float ViewportMinZ,
8753 float ViewportMaxZ,
8754 CXMMATRIX Projection,
8755 CXMMATRIX View,
8756 CXMMATRIX World
8757)
8758{
8759 assert(pOutputStream != NULL);
8760 assert(pInputStream != NULL);
8761
8762#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
8763
8764 const float HalfViewportWidth = ViewportWidth * 0.5f;
8765 const float HalfViewportHeight = ViewportHeight * 0.5f;
8766
8767 XMVECTOR Scale = XMVectorSet(HalfViewportWidth, -HalfViewportHeight, ViewportMaxZ - ViewportMinZ, 1.0f);
8768 XMVECTOR Offset = XMVectorSet(ViewportX + HalfViewportWidth, ViewportY + HalfViewportHeight, ViewportMinZ, 0.0f);
8769
8770 XMMATRIX Transform = XMMatrixMultiply(World, View);
8771 Transform = XMMatrixMultiply(Transform, Projection);
8772
8773 const uint8_t* pInputVector = (const uint8_t*)pInputStream;
8774 uint8_t* pOutputVector = (uint8_t*)pOutputStream;
8775
8776 for (size_t i = 0; i < VectorCount; i++)
8777 {
8778 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
8779
8780 XMVECTOR Result = XMVector3TransformCoord(V, Transform);
8781 Result = XMVectorMultiplyAdd(Result, Scale, Offset);
8782
8783 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
8784
8785 pInputVector += InputStride;
8786 pOutputVector += OutputStride;
8787 }
8788
8789 return pOutputStream;
8790
8791#else // _XM_VMX128_INTRINSICS_
8792#endif // _XM_VMX128_INTRINSICS_
8793}
8794
8795//------------------------------------------------------------------------------
8796
8797inline XMVECTOR XMVector3Unproject
8798(
8799 FXMVECTOR V,
8800 float ViewportX,
8801 float ViewportY,
8802 float ViewportWidth,
8803 float ViewportHeight,
8804 float ViewportMinZ,
8805 float ViewportMaxZ,
8806 CXMMATRIX Projection,
8807 CXMMATRIX View,
8808 CXMMATRIX World
8809)
8810{
8811#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
8812
8813 static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };
8814
8815 XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
8816 Scale = XMVectorReciprocal(Scale);
8817
8818 XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
8819 Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);
8820
8821 XMMATRIX Transform = XMMatrixMultiply(World, View);
8822 Transform = XMMatrixMultiply(Transform, Projection);
8823 Transform = XMMatrixInverse(NULL, Transform);
8824
8825 XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);
8826
8827 return XMVector3TransformCoord(Result, Transform);
8828
8829#else // _XM_VMX128_INTRINSICS_
8830#endif // _XM_VMX128_INTRINSICS_
8831}
8832
8833//------------------------------------------------------------------------------
8834
8835_Use_decl_annotations_
8836inline XMFLOAT3* XMVector3UnprojectStream
8837(
8838 XMFLOAT3* pOutputStream,
8839 size_t OutputStride,
8840 const XMFLOAT3* pInputStream,
8841 size_t InputStride,
8842 size_t VectorCount,
8843 float ViewportX,
8844 float ViewportY,
8845 float ViewportWidth,
8846 float ViewportHeight,
8847 float ViewportMinZ,
8848 float ViewportMaxZ,
8849 CXMMATRIX Projection,
8850 CXMMATRIX View,
8851 CXMMATRIX World)
8852{
8853 assert(pOutputStream != NULL);
8854 assert(pInputStream != NULL);
8855
8856#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_ARM_NEON_INTRINSICS_)
8857
8858 static const XMVECTORF32 D = { -1.0f, 1.0f, 0.0f, 0.0f };
8859
8860 XMVECTOR Scale = XMVectorSet(ViewportWidth * 0.5f, -ViewportHeight * 0.5f, ViewportMaxZ - ViewportMinZ, 1.0f);
8861 Scale = XMVectorReciprocal(Scale);
8862
8863 XMVECTOR Offset = XMVectorSet(-ViewportX, -ViewportY, -ViewportMinZ, 0.0f);
8864 Offset = XMVectorMultiplyAdd(Scale, Offset, D.v);
8865
8866 XMMATRIX Transform = XMMatrixMultiply(World, View);
8867 Transform = XMMatrixMultiply(Transform, Projection);
8868 Transform = XMMatrixInverse(NULL, Transform);
8869
8870 const uint8_t* pInputVector = (const uint8_t*)pInputStream;
8871 uint8_t* pOutputVector = (uint8_t*)pOutputStream;
8872
8873 for (size_t i = 0; i < VectorCount; i++)
8874 {
8875 XMVECTOR V = XMLoadFloat3((const XMFLOAT3*)pInputVector);
8876
8877 XMVECTOR Result = XMVectorMultiplyAdd(V, Scale, Offset);
8878
8879 Result = XMVector3TransformCoord(Result, Transform);
8880
8881 XMStoreFloat3((XMFLOAT3*)pOutputVector, Result);
8882
8883 pInputVector += InputStride;
8884 pOutputVector += OutputStride;
8885 }
8886
8887 return pOutputStream;
8888
8889#else // _XM_VMX128_INTRINSICS_
8890#endif // _XM_VMX128_INTRINSICS_
8891}
8892
8893/****************************************************************************
8894 *
8895 * 4D Vector
8896 *
8897 ****************************************************************************/
8898
8899//------------------------------------------------------------------------------
8900// Comparison operations
8901//------------------------------------------------------------------------------
8902
8903//------------------------------------------------------------------------------
8904
// Return true if all four components of V1 and V2 compare exactly equal
// (floating-point == per lane; NaN lanes compare unequal).
inline bool XMVector4Equal
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] == V2.vector4_f32[0]) && (V1.vector4_f32[1] == V2.vector4_f32[1]) && (V1.vector4_f32[2] == V2.vector4_f32[2]) && (V1.vector4_f32[3] == V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_f32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    // Only all-ones survives if every lane matched.
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    // All four sign bits set (0x0f) -> every lane equal.
    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
#else
    return XMComparisonAllTrue(XMVector4EqualR(V1, V2));
#endif
}
8925
8926//------------------------------------------------------------------------------
8927
// Compare V1 and V2 for equality and return a CR6-style comparison record:
// XM_CRMASK_CR6TRUE if all lanes equal, XM_CRMASK_CR6FALSE if no lane equals,
// 0 for a mixed result. Decode with XMComparisonAllTrue/AnyTrue/etc.
inline uint32_t XMVector4EqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    uint32_t CR = 0;

    if ((V1.vector4_f32[0] == V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] == V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] == V2.vector4_f32[2]) &&
        (V1.vector4_f32[3] == V2.vector4_f32[3]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] != V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] != V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] != V2.vector4_f32[2]) &&
        (V1.vector4_f32[3] != V2.vector4_f32[3]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_f32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);

    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpeq_ps(V1,V2);
    int iTest = _mm_movemask_ps(vTemp);
    uint32_t CR = 0;
    if (iTest==0xf)     // All equal?
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (iTest==0)  // All not equal?
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
8986
8987//------------------------------------------------------------------------------
8988
// Return true if all four components of V1 and V2 are bitwise equal,
// treating each lane as a 32-bit integer (no floating-point semantics).
inline bool XMVector4EqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] == V2.vector4_u32[0]) && (V1.vector4_u32[1] == V2.vector4_u32[1]) && (V1.vector4_u32[2] == V2.vector4_u32[2]) && (V1.vector4_u32[3] == V2.vector4_u32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_u32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Integer compare; reinterpret the float vectors as raw 32-bit lanes.
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))==0xf) != 0);
#else
    return XMComparisonAllTrue(XMVector4EqualIntR(V1, V2));
#endif
}
9009
9010//------------------------------------------------------------------------------
9011
// Bitwise (integer) equality compare of V1 and V2 returning a CR6-style
// record: XM_CRMASK_CR6TRUE if all lanes equal, XM_CRMASK_CR6FALSE if no
// lane equals, 0 for a mixed result.
inline uint32_t XMVector4EqualIntR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if (V1.vector4_u32[0] == V2.vector4_u32[0] &&
        V1.vector4_u32[1] == V2.vector4_u32[1] &&
        V1.vector4_u32[2] == V2.vector4_u32[2] &&
        V1.vector4_u32[3] == V2.vector4_u32[3])
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (V1.vector4_u32[0] != V2.vector4_u32[0] &&
        V1.vector4_u32[1] != V2.vector4_u32[1] &&
        V1.vector4_u32[2] != V2.vector4_u32[2] &&
        V1.vector4_u32[3] != V2.vector4_u32[3])
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_u32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);

    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    int iTest = _mm_movemask_ps(_mm_castsi128_ps(vTemp));
    uint32_t CR = 0;
    if (iTest==0xf)     // All equal?
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (iTest==0)  // All not equal?
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9068
// Return true if every component of V1 is within the per-component Epsilon
// of V2, i.e. |V1 - V2| <= Epsilon in all four lanes.
inline bool XMVector4NearEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR Epsilon
)
{
#if defined(_XM_NO_INTRINSICS_)
    float dx, dy, dz, dw;

    dx = fabsf(V1.vector4_f32[0]-V2.vector4_f32[0]);
    dy = fabsf(V1.vector4_f32[1]-V2.vector4_f32[1]);
    dz = fabsf(V1.vector4_f32[2]-V2.vector4_f32[2]);
    dw = fabsf(V1.vector4_f32[3]-V2.vector4_f32[3]);
    return (((dx <= Epsilon.vector4_f32[0]) &&
            (dy <= Epsilon.vector4_f32[1]) &&
            (dz <= Epsilon.vector4_f32[2]) &&
            (dw <= Epsilon.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // vacleq tests |delta| <= |Epsilon| in one instruction.
    __n128 vDelta = vsubq_f32( V1, V2 );
    __n128 vResult = vacleq_f32( vDelta, Epsilon );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Get the difference
    XMVECTOR vDelta = _mm_sub_ps(V1,V2);
    // Get the absolute value of the difference via max(-delta, delta)
    XMVECTOR vTemp = _mm_setzero_ps();
    vTemp = _mm_sub_ps(vTemp,vDelta);
    vTemp = _mm_max_ps(vTemp,vDelta);
    vTemp = _mm_cmple_ps(vTemp,Epsilon);
    // All four lanes within tolerance?
    return ((_mm_movemask_ps(vTemp)==0xf) != 0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9105
9106//------------------------------------------------------------------------------
9107
// Return true if ANY component of V1 differs from V2 (logical negation of
// XMVector4Equal).
inline bool XMVector4NotEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] != V2.vector4_f32[0]) || (V1.vector4_f32[1] != V2.vector4_f32[1]) || (V1.vector4_f32[2] != V2.vector4_f32[2]) || (V1.vector4_f32[3] != V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_f32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    // Anything other than all-ones means at least one lane differed.
    return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpneq_ps(V1,V2);
    // Any set sign bit -> at least one lane differed.
    return ((_mm_movemask_ps(vTemp)) != 0);
#else
    return XMComparisonAnyFalse(XMVector4EqualR(V1, V2));
#endif
}
9128
9129//------------------------------------------------------------------------------
9130
// Return true if ANY component of V1 differs bitwise from V2, treating each
// lane as a 32-bit integer (negation of XMVector4EqualInt).
inline bool XMVector4NotEqualInt
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_u32[0] != V2.vector4_u32[0]) || (V1.vector4_u32[1] != V2.vector4_u32[1]) || (V1.vector4_u32[2] != V2.vector4_u32[2]) || (V1.vector4_u32[3] != V2.vector4_u32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vceqq_u32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    __m128i vTemp = _mm_cmpeq_epi32(_mm_castps_si128(V1),_mm_castps_si128(V2));
    // Not all four equality bits set -> at least one lane differed.
    return ((_mm_movemask_ps(_mm_castsi128_ps(vTemp))!=0xF) != 0);
#else
    return XMComparisonAnyFalse(XMVector4EqualIntR(V1, V2));
#endif
}
9151
9152//------------------------------------------------------------------------------
9153
// Return true if every component of V1 is strictly greater than the
// corresponding component of V2.
inline bool XMVector4Greater
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] > V2.vector4_f32[0]) && (V1.vector4_f32[1] > V2.vector4_f32[1]) && (V1.vector4_f32[2] > V2.vector4_f32[2]) && (V1.vector4_f32[3] > V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgtq_f32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    // All four sign bits set -> every lane greater.
    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
#else
    return XMComparisonAllTrue(XMVector4GreaterR(V1, V2));
#endif
}
9174
9175//------------------------------------------------------------------------------
9176
// Compare V1 > V2 per component and return a CR6-style record:
// XM_CRMASK_CR6TRUE if all lanes are greater, XM_CRMASK_CR6FALSE if none
// are, 0 for a mixed result.
inline uint32_t XMVector4GreaterR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if (V1.vector4_f32[0] > V2.vector4_f32[0] &&
        V1.vector4_f32[1] > V2.vector4_f32[1] &&
        V1.vector4_f32[2] > V2.vector4_f32[2] &&
        V1.vector4_f32[3] > V2.vector4_f32[3])
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (V1.vector4_f32[0] <= V2.vector4_f32[0] &&
        V1.vector4_f32[1] <= V2.vector4_f32[1] &&
        V1.vector4_f32[2] <= V2.vector4_f32[2] &&
        V1.vector4_f32[3] <= V2.vector4_f32[3])
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgtq_f32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);

    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    uint32_t CR = 0;
    XMVECTOR vTemp = _mm_cmpgt_ps(V1,V2);
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0xf) {   // All greater?
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)    // None greater?
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9232
9233//------------------------------------------------------------------------------
9234
// Return true if every component of V1 is greater than or equal to the
// corresponding component of V2.
inline bool XMVector4GreaterOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] >= V2.vector4_f32[0]) && (V1.vector4_f32[1] >= V2.vector4_f32[1]) && (V1.vector4_f32[2] >= V2.vector4_f32[2]) && (V1.vector4_f32[3] >= V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgeq_f32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    // All four sign bits set -> every lane >=.
    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
#else
    return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V1, V2));
#endif
}
9255
9256//------------------------------------------------------------------------------
9257
// Compare V1 >= V2 per component and return a CR6-style record:
// XM_CRMASK_CR6TRUE if all lanes are >=, XM_CRMASK_CR6FALSE if none are,
// 0 for a mixed result.
inline uint32_t XMVector4GreaterOrEqualR
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    uint32_t CR = 0;
    if ((V1.vector4_f32[0] >= V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] >= V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] >= V2.vector4_f32[2]) &&
        (V1.vector4_f32[3] >= V2.vector4_f32[3]))
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ((V1.vector4_f32[0] < V2.vector4_f32[0]) &&
        (V1.vector4_f32[1] < V2.vector4_f32[1]) &&
        (V1.vector4_f32[2] < V2.vector4_f32[2]) &&
        (V1.vector4_f32[3] < V2.vector4_f32[3]))
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcgeq_f32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    uint32_t r = vget_lane_u32(vTemp.val[1], 1);

    uint32_t CR = 0;
    if ( r == 0xFFFFFFFFU )
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if ( !r )
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#elif defined(_XM_SSE_INTRINSICS_)
    uint32_t CR = 0;
    XMVECTOR vTemp = _mm_cmpge_ps(V1,V2);
    int iTest = _mm_movemask_ps(vTemp);
    if (iTest==0x0f)    // All >= ?
    {
        CR = XM_CRMASK_CR6TRUE;
    }
    else if (!iTest)    // None >= ?
    {
        CR = XM_CRMASK_CR6FALSE;
    }
    return CR;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9314
9315//------------------------------------------------------------------------------
9316
// Return true if every component of V1 is strictly less than the
// corresponding component of V2.
inline bool XMVector4Less
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] < V2.vector4_f32[0]) && (V1.vector4_f32[1] < V2.vector4_f32[1]) && (V1.vector4_f32[2] < V2.vector4_f32[2]) && (V1.vector4_f32[3] < V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcltq_f32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmplt_ps(V1,V2);
    // All four sign bits set -> every lane less.
    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
#else
    // V1 < V2 is equivalent to V2 > V1.
    return XMComparisonAllTrue(XMVector4GreaterR(V2, V1));
#endif
}
9337
9338//------------------------------------------------------------------------------
9339
// Return true if every component of V1 is less than or equal to the
// corresponding component of V2.
inline bool XMVector4LessOrEqual
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V1.vector4_f32[0] <= V2.vector4_f32[0]) && (V1.vector4_f32[1] <= V2.vector4_f32[1]) && (V1.vector4_f32[2] <= V2.vector4_f32[2]) && (V1.vector4_f32[3] <= V2.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    __n128 vResult = vcleq_f32( V1, V2 );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR vTemp = _mm_cmple_ps(V1,V2);
    // All four sign bits set -> every lane <=.
    return ((_mm_movemask_ps(vTemp)==0x0f) != 0);
#else
    // V1 <= V2 is equivalent to V2 >= V1.
    return XMComparisonAllTrue(XMVector4GreaterOrEqualR(V2, V1));
#endif
}
9360
9361//------------------------------------------------------------------------------
9362
// Return true if every component of V lies within [-Bounds, +Bounds] for
// the corresponding component of Bounds.
inline bool XMVector4InBounds
(
    FXMVECTOR V,
    FXMVECTOR Bounds
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (((V.vector4_f32[0] <= Bounds.vector4_f32[0] && V.vector4_f32[0] >= -Bounds.vector4_f32[0]) &&
        (V.vector4_f32[1] <= Bounds.vector4_f32[1] && V.vector4_f32[1] >= -Bounds.vector4_f32[1]) &&
        (V.vector4_f32[2] <= Bounds.vector4_f32[2] && V.vector4_f32[2] >= -Bounds.vector4_f32[2]) &&
        (V.vector4_f32[3] <= Bounds.vector4_f32[3] && V.vector4_f32[3] >= -Bounds.vector4_f32[3])) != 0);
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test if less than or equal
    __n128 vTemp1 = vcleq_f32(V,Bounds);
    // Negate the bounds
    __n128 vTemp2 = vnegq_f32(Bounds);
    // Test if greater or equal (Reversed)
    vTemp2 = vcleq_f32(vTemp2,V);
    // Blend answers
    vTemp1 = vandq_u32(vTemp1,vTemp2);
    // in bounds? Collapse the four lane masks into one 32-bit word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTemp1), vget_high_u8(vTemp1));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test if less than or equal
    XMVECTOR vTemp1 = _mm_cmple_ps(V,Bounds);
    // Negate the bounds
    XMVECTOR vTemp2 = _mm_mul_ps(Bounds,g_XMNegativeOne);
    // Test if greater or equal (Reversed)
    vTemp2 = _mm_cmple_ps(vTemp2,V);
    // Blend answers
    vTemp1 = _mm_and_ps(vTemp1,vTemp2);
    // All in bounds?
    return ((_mm_movemask_ps(vTemp1)==0x0f) != 0);
#else
    return XMComparisonAllInBounds(XMVector4InBoundsR(V, Bounds));
#endif
}
9402
9403
9404//------------------------------------------------------------------------------
9405
// Return true if ANY component of V is NaN.
inline bool XMVector4IsNaN
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    return (XMISNAN(V.vector4_f32[0]) ||
            XMISNAN(V.vector4_f32[1]) ||
            XMISNAN(V.vector4_f32[2]) ||
            XMISNAN(V.vector4_f32[3]));
#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Test against itself. NaN is always not equal
    __n128 vTempNan = vceqq_f32( V, V );
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempNan), vget_high_u8(vTempNan));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    // If any are NaN, the mask is zero
    return ( vget_lane_u32(vTemp.val[1], 1) != 0xFFFFFFFFU );
#elif defined(_XM_SSE_INTRINSICS_)
    // Test against itself. NaN is always not equal
    XMVECTOR vTempNan = _mm_cmpneq_ps(V,V);
    // If any are NaN, the mask is non-zero
    return (_mm_movemask_ps(vTempNan)!=0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9431
9432//------------------------------------------------------------------------------
9433
// Return true if ANY component of V is +infinity or -infinity.
inline bool XMVector4IsInfinite
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    return (XMISINF(V.vector4_f32[0]) ||
            XMISINF(V.vector4_f32[1]) ||
            XMISINF(V.vector4_f32[2]) ||
            XMISINF(V.vector4_f32[3]));

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Mask off the sign bit
    __n128 vTempInf = vandq_u32( V, g_XMAbsMask );
    // Compare to infinity
    vTempInf = vceqq_f32(vTempInf, g_XMInfinity );
    // If any are infinity, the signs are true.
    // Collapse the four per-lane masks into a single 32-bit summary word.
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vTempInf), vget_high_u8(vTempInf));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    return ( vget_lane_u32(vTemp.val[1], 1) != 0 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Mask off the sign bit
    XMVECTOR vTemp = _mm_and_ps(V,g_XMAbsMask);
    // Compare to infinity
    vTemp = _mm_cmpeq_ps(vTemp,g_XMInfinity);
    // If any are infinity, the signs are true.
    return (_mm_movemask_ps(vTemp) != 0);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9465
9466//------------------------------------------------------------------------------
9467// Computation operations
9468//------------------------------------------------------------------------------
9469
9470//------------------------------------------------------------------------------
9471
// Compute the 4D dot product of V1 and V2 and replicate the scalar result
// into all four components of the returned vector.
inline XMVECTOR XMVector4Dot
(
    FXMVECTOR V1,
    FXMVECTOR V2
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    // Broadcast the scalar dot product into every lane.
    Result.vector4_f32[0] =
    Result.vector4_f32[1] =
    Result.vector4_f32[2] =
    Result.vector4_f32[3] = V1.vector4_f32[0] * V2.vector4_f32[0] + V1.vector4_f32[1] * V2.vector4_f32[1] + V1.vector4_f32[2] * V2.vector4_f32[2] + V1.vector4_f32[3] * V2.vector4_f32[3];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Multiply lanes, then pairwise-add the halves to reduce to the sum.
    __n128 vTemp = vmulq_f32( V1, V2 );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vpadd_f32( v2, v2 );
    v1 = vadd_f32( v1, v2 );
    // Duplicate the scalar sum into both halves of the result.
    return vcombine_f32( v1, v1 );
#elif defined(_XM_SSE_INTRINSICS_)
    // Horizontal add via shuffles (works without SSE3/SSE4 dot instructions).
    XMVECTOR vTemp2 = V2;
    XMVECTOR vTemp = _mm_mul_ps(V1,vTemp2);
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp,_MM_SHUFFLE(1,0,0,0)); // Copy X to the Z position and Y to the W position
    vTemp2 = _mm_add_ps(vTemp2,vTemp);          // Add Z = X+Z; W = Y+W;
    vTemp = _mm_shuffle_ps(vTemp,vTemp2,_MM_SHUFFLE(0,3,0,0));  // Copy W to the Z position
    vTemp = _mm_add_ps(vTemp,vTemp2);           // Add Z and W together
    return XM_PERMUTE_PS(vTemp,_MM_SHUFFLE(2,2,2,2));    // Splat Z and return
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9506
9507//------------------------------------------------------------------------------
9508
// Computes the 4D cross product of three vectors (the generalized cross
// product: a vector orthogonal to V1, V2 and V3 in 4-space). The component
// expansion implemented by all three code paths is given below.
inline XMVECTOR XMVector4Cross
(
    FXMVECTOR V1,
    FXMVECTOR V2,
    FXMVECTOR V3
)
{
    // [ ((v2.z*v3.w-v2.w*v3.z)*v1.y)-((v2.y*v3.w-v2.w*v3.y)*v1.z)+((v2.y*v3.z-v2.z*v3.y)*v1.w),
    //   ((v2.w*v3.z-v2.z*v3.w)*v1.x)-((v2.w*v3.x-v2.x*v3.w)*v1.z)+((v2.z*v3.x-v2.x*v3.z)*v1.w),
    //   ((v2.y*v3.w-v2.w*v3.y)*v1.x)-((v2.x*v3.w-v2.w*v3.x)*v1.y)+((v2.x*v3.y-v2.y*v3.x)*v1.w),
    //   ((v2.z*v3.y-v2.y*v3.z)*v1.x)-((v2.z*v3.x-v2.x*v3.z)*v1.y)+((v2.y*v3.x-v2.x*v3.y)*v1.z) ]

#if defined(_XM_NO_INTRINSICS_)
    // Scalar path: direct evaluation of the expansion above, one component at a time.
    XMVECTOR Result;

    Result.vector4_f32[0] = (((V2.vector4_f32[2]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[2]))*V1.vector4_f32[1])-(((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[2])+(((V2.vector4_f32[1]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[1]))*V1.vector4_f32[3]);
    Result.vector4_f32[1] = (((V2.vector4_f32[3]*V3.vector4_f32[2])-(V2.vector4_f32[2]*V3.vector4_f32[3]))*V1.vector4_f32[0])-(((V2.vector4_f32[3]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[3]))*V1.vector4_f32[2])+(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[3]);
    Result.vector4_f32[2] = (((V2.vector4_f32[1]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[1]))*V1.vector4_f32[0])-(((V2.vector4_f32[0]*V3.vector4_f32[3])-(V2.vector4_f32[3]*V3.vector4_f32[0]))*V1.vector4_f32[1])+(((V2.vector4_f32[0]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[0]))*V1.vector4_f32[3]);
    Result.vector4_f32[3] = (((V2.vector4_f32[2]*V3.vector4_f32[1])-(V2.vector4_f32[1]*V3.vector4_f32[2]))*V1.vector4_f32[0])-(((V2.vector4_f32[2]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[2]))*V1.vector4_f32[1])+(((V2.vector4_f32[1]*V3.vector4_f32[0])-(V2.vector4_f32[0]*V3.vector4_f32[1]))*V1.vector4_f32[2]);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // NEON path: builds each permuted operand from 64-bit half-vector swizzles
    // (vrev64 swaps a pair; vbsl with the X-lane mask mixes two pairs).
    const __n64 select = vget_low_f32( g_XMMaskX );

    // Term1: V2zwyz * V3wzwy
    const __n64 v2xy = vget_low_f32(V2);
    const __n64 v2zw = vget_high_f32(V2);
    const __n64 v2yx = vrev64_f32(v2xy);
    const __n64 v2wz = vrev64_f32(v2zw);
    const __n64 v2yz = vbsl_f32( select, v2yx, v2wz );

    const __n64 v3zw = vget_high_f32(V3);
    const __n64 v3wz = vrev64_f32(v3zw);
    const __n64 v3xy = vget_low_f32(V3);
    const __n64 v3wy = vbsl_f32( select, v3wz, v3xy );

    __n128 vTemp1 = vcombine_f32(v2zw,v2yz);
    __n128 vTemp2 = vcombine_f32(v3wz,v3wy);
    __n128 vResult = vmulq_f32( vTemp1, vTemp2 );

    // - V2wzwy * V3zwyz
    const __n64 v2wy = vbsl_f32( select, v2wz, v2xy );

    const __n64 v3yx = vrev64_f32(v3xy);
    const __n64 v3yz = vbsl_f32( select, v3yx, v3wz );

    vTemp1 = vcombine_f32(v2wz,v2wy);
    vTemp2 = vcombine_f32(v3zw,v3yz);
    vResult = vmlsq_f32( vResult, vTemp1, vTemp2 );

    // term1 * V1yxxx
    const __n64 v1xy = vget_low_f32(V1);
    const __n64 v1yx = vrev64_f32(v1xy);

    vTemp1 = vcombine_f32( v1yx, vdup_lane_f32( v1yx, 1 ) );
    vResult = vmulq_f32( vResult, vTemp1 );

    // Term2: V2ywxz * V3wxwx
    const __n64 v2yw = vrev64_f32(v2wy);
    const __n64 v2xz = vbsl_f32( select, v2xy, v2wz );

    const __n64 v3wx = vbsl_f32( select, v3wz, v3yx );

    vTemp1 = vcombine_f32(v2yw,v2xz);
    vTemp2 = vcombine_f32(v3wx,v3wx);
    __n128 vTerm = vmulq_f32( vTemp1, vTemp2 );

    // - V2wxwx * V3ywxz
    const __n64 v2wx = vbsl_f32( select, v2wz, v2yx );

    const __n64 v3yw = vrev64_f32(v3wy);
    const __n64 v3xz = vbsl_f32( select, v3xy, v3wz );

    vTemp1 = vcombine_f32(v2wx,v2wx);
    vTemp2 = vcombine_f32(v3yw,v3xz);
    vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 );

    // vResult - term2 * V1zzyy
    const __n64 v1zw = vget_high_f32(V1);

    vTemp1 = vcombine_f32( vdup_lane_f32(v1zw, 0), vdup_lane_f32(v1yx, 0) );
    vResult = vmlsq_f32( vResult, vTerm, vTemp1 );

    // Term3: V2yzxy * V3zxyx
    const __n64 v3zx = vrev64_f32(v3xz);

    vTemp1 = vcombine_f32(v2yz,v2xy);
    vTemp2 = vcombine_f32(v3zx,v3yx);
    vTerm = vmulq_f32( vTemp1, vTemp2 );

    // - V2zxyx * V3yzxy
    const __n64 v2zx = vrev64_f32(v2xz);

    vTemp1 = vcombine_f32(v2zx,v2yx);
    vTemp2 = vcombine_f32(v3yz,v3xy);
    vTerm = vmlsq_f32( vTerm, vTemp1, vTemp2 );

    // vResult + term3 * V1wwwz
    const __n64 v1wz = vrev64_f32(v1zw);

    vTemp1 = vcombine_f32( vdup_lane_f32( v1wz, 0 ), v1wz );
    return vmlaq_f32( vResult, vTerm, vTemp1 );
#elif defined(_XM_SSE_INTRINSICS_)
    // SSE path: three accumulated (permute, multiply, subtract) terms,
    // each scaled by a permutation of V1, matching the expansion above.
    // V2zwyz * V3wzwy
    XMVECTOR vResult = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,1,3,2));
    XMVECTOR vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,3,2,3));
    vResult = _mm_mul_ps(vResult,vTemp3);
    // - V2wzwy * V3zwyz
    XMVECTOR vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,3,2,3));
    vTemp3 = XM_PERMUTE_PS(vTemp3,_MM_SHUFFLE(1,3,0,1));
    vTemp2 = _mm_mul_ps(vTemp2,vTemp3);
    vResult = _mm_sub_ps(vResult,vTemp2);
    // term1 * V1yxxx
    XMVECTOR vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(0,0,0,1));
    vResult = _mm_mul_ps(vResult,vTemp1);

    // V2ywxz * V3wxwx
    vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(2,0,3,1));
    vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,3,0,3));
    vTemp3 = _mm_mul_ps(vTemp3,vTemp2);
    // - V2wxwx * V3ywxz
    vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,1,2,1));
    vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(2,0,3,1));
    vTemp2 = _mm_mul_ps(vTemp2,vTemp1);
    vTemp3 = _mm_sub_ps(vTemp3,vTemp2);
    // vResult - temp * V1zzyy
    vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(1,1,2,2));
    vTemp1 = _mm_mul_ps(vTemp1,vTemp3);
    vResult = _mm_sub_ps(vResult,vTemp1);

    // V2yzxy * V3zxyx
    vTemp2 = XM_PERMUTE_PS(V2,_MM_SHUFFLE(1,0,2,1));
    vTemp3 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(0,1,0,2));
    vTemp3 = _mm_mul_ps(vTemp3,vTemp2);
    // - V2zxyx * V3yzxy
    vTemp2 = XM_PERMUTE_PS(vTemp2,_MM_SHUFFLE(2,0,2,1));
    vTemp1 = XM_PERMUTE_PS(V3,_MM_SHUFFLE(1,0,2,1));
    vTemp1 = _mm_mul_ps(vTemp1,vTemp2);
    vTemp3 = _mm_sub_ps(vTemp3,vTemp1);
    // vResult + term * V1wwwz
    vTemp1 = XM_PERMUTE_PS(V1,_MM_SHUFFLE(2,3,3,3));
    vTemp3 = _mm_mul_ps(vTemp3,vTemp1);
    vResult = _mm_add_ps(vResult,vTemp3);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9656
9657//------------------------------------------------------------------------------
9658
// Returns the squared 4D length of V (dot of V with itself),
// replicated into all four components.
inline XMVECTOR XMVector4LengthSq
(
    FXMVECTOR V
)
{
    return XMVector4Dot(V, V);
}
9666
9667//------------------------------------------------------------------------------
9668
// Estimates 1 / length(V) for a 4D vector using the hardware reciprocal
// square-root estimate (lower precision, faster). Result is splatted
// into all four components.
inline XMVECTOR XMVector4ReciprocalLengthEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;

    Result = XMVector4LengthSq(V);
    Result = XMVectorReciprocalSqrtEst(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vpadd_f32( v2, v2 );
    v1 = vadd_f32( v1, v2 );
    // Reciprocal sqrt (estimate)
    v2 = vrsqrte_f32( v1 );
    return vcombine_f32(v2, v2);
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y,z and w
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has z and w
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
    // x+z, y+w
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // x+z,x+z,x+z,y+w
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
    // ??,??,y+w,y+w
    vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
    // ??,??,x+z+y+w,??
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // Splat the length
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
    // Get the reciprocal
    vLengthSq = _mm_rsqrt_ps(vLengthSq);
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9715
9716//------------------------------------------------------------------------------
9717
// Computes 1 / length(V) for a 4D vector at full precision.
// NEON refines the rsqrt estimate with two Newton-Raphson steps; SSE uses
// an exact sqrt followed by a divide. Result is splatted into all lanes.
inline XMVECTOR XMVector4ReciprocalLength
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;

    Result = XMVector4LengthSq(V);
    Result = XMVectorReciprocalSqrt(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vpadd_f32( v2, v2 );
    v1 = vadd_f32( v1, v2 );
    // Reciprocal sqrt (2 Newton-Raphson refinement steps)
    __n64 S0 = vrsqrte_f32(v1);
    __n64 P0 = vmul_f32( v1, S0 );
    __n64 R0 = vrsqrts_f32( P0, S0 );
    __n64 S1 = vmul_f32( S0, R0 );
    __n64 P1 = vmul_f32( v1, S1 );
    __n64 R1 = vrsqrts_f32( P1, S1 );
    __n64 Result = vmul_f32( S1, R1 );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y,z and w
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has z and w
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
    // x+z, y+w
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // x+z,x+z,x+z,y+w
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
    // ??,??,y+w,y+w
    vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
    // ??,??,x+z+y+w,??
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // Splat the length
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
    // Get the reciprocal
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    // Accurate!
    vLengthSq = _mm_div_ps(g_XMOne,vLengthSq);
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9772
9773//------------------------------------------------------------------------------
9774
// Estimates the 4D length of V (lower precision, faster).
// The NEON path explicitly maps a zero-length input to zero, since
// sqrt-via-rsqrt would otherwise produce 0 * inf = NaN.
inline XMVECTOR XMVector4LengthEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;

    Result = XMVector4LengthSq(V);
    Result = XMVectorSqrtEst(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vpadd_f32( v2, v2 );
    v1 = vadd_f32( v1, v2 );
    const __n64 zero = vdup_n_u32(0);
    __n64 VEqualsZero = vceq_f32( v1, zero );
    // Sqrt (estimate): sqrt(x) = x * rsqrt(x)
    __n64 Result = vrsqrte_f32( v1 );
    Result = vmul_f32( v1, Result );
    // Force the zero-length case back to zero
    Result = vbsl_f32( VEqualsZero, zero, Result );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y,z and w
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has z and w
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
    // x+z, y+w
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // x+z,x+z,x+z,y+w
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
    // ??,??,y+w,y+w
    vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
    // ??,??,x+z+y+w,??
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // Splat the length
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
    // Prepare for the division
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9825
9826//------------------------------------------------------------------------------
9827
// Computes the 4D length of V at full precision, splatted into all lanes.
// NEON refines rsqrt with two Newton-Raphson steps, then multiplies by the
// squared length; the zero-length case is masked back to zero to avoid NaN.
inline XMVECTOR XMVector4Length
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;

    Result = XMVector4LengthSq(V);
    Result = XMVectorSqrt(Result);

    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vpadd_f32( v2, v2 );
    v1 = vadd_f32( v1, v2 );
    const __n64 zero = vdup_n_u32(0);
    __n64 VEqualsZero = vceq_f32( v1, zero );
    // Sqrt
    __n64 S0 = vrsqrte_f32( v1 );
    __n64 P0 = vmul_f32( v1, S0 );
    __n64 R0 = vrsqrts_f32( P0, S0 );
    __n64 S1 = vmul_f32( S0, R0 );
    __n64 P1 = vmul_f32( v1, S1 );
    __n64 R1 = vrsqrts_f32( P1, S1 );
    __n64 Result = vmul_f32( S1, R1 );
    Result = vmul_f32( v1, Result );
    Result = vbsl_f32( VEqualsZero, zero, Result );
    return vcombine_f32( Result, Result );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y,z and w
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has z and w
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
    // x+z, y+w
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // x+z,x+z,x+z,y+w
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
    // ??,??,y+w,y+w
    vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
    // ??,??,x+z+y+w,??
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // Splat the length
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
    // Prepare for the division
    vLengthSq = _mm_sqrt_ps(vLengthSq);
    return vLengthSq;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9884
9885//------------------------------------------------------------------------------
9886// XMVector4NormalizeEst uses a reciprocal estimate and
9887// returns QNaN on zero and infinite vectors.
9888
// Estimates the normalized form of a 4D vector using the hardware rsqrt
// estimate. Per the note above this function, zero and infinite inputs
// produce QNaN rather than being special-cased.
inline XMVECTOR XMVector4NormalizeEst
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result = XMVector4ReciprocalLength(V);
    Result = XMVectorMultiply(V, Result);
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vpadd_f32( v2, v2 );
    v1 = vadd_f32( v1, v2 );
    // Reciprocal sqrt (estimate)
    v2 = vrsqrte_f32( v1 );
    // Normalize
    return vmulq_f32( V, vcombine_f32(v2,v2) );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y,z and w
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has z and w
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
    // x+z, y+w
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // x+z,x+z,x+z,y+w
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
    // ??,??,y+w,y+w
    vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
    // ??,??,x+z+y+w,??
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // Splat the length
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
    // Get the reciprocal
    XMVECTOR vResult = _mm_rsqrt_ps(vLengthSq);
    // Reciprocal mul to perform the normalization
    vResult = _mm_mul_ps(vResult,V);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
9936
9937//------------------------------------------------------------------------------
9938
// Normalizes a 4D vector at full precision.
// Special cases (intrinsic paths): a zero-length input yields a zero
// vector; an infinite-length input yields QNaN in each component.
// Note the scalar path simply leaves the components scaled by 1 (i.e.
// returns V's components unscaled) when the length is zero.
inline XMVECTOR XMVector4Normalize
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)
    float fLength;
    XMVECTOR vResult;

    vResult = XMVector4Length( V );
    fLength = vResult.vector4_f32[0];

    // Prevent divide by zero
    if (fLength > 0) {
        fLength = 1.0f/fLength;
    }

    vResult.vector4_f32[0] = V.vector4_f32[0]*fLength;
    vResult.vector4_f32[1] = V.vector4_f32[1]*fLength;
    vResult.vector4_f32[2] = V.vector4_f32[2]*fLength;
    vResult.vector4_f32[3] = V.vector4_f32[3]*fLength;
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Dot4
    __n128 vTemp = vmulq_f32( V, V );
    __n64 v1 = vget_low_f32( vTemp );
    __n64 v2 = vget_high_f32( vTemp );
    v1 = vpadd_f32( v1, v1 );
    v2 = vpadd_f32( v2, v2 );
    v1 = vadd_f32( v1, v2 );
    __n64 VEqualsZero = vceq_f32( v1, vdup_n_u32(0) );
    __n64 VEqualsInf = vceq_f32( v1, vget_low_f32(g_XMInfinity) );
    // Reciprocal sqrt (2 iterations of Newton-Raphson)
    __n64 S0 = vrsqrte_f32( v1 );
    __n64 P0 = vmul_f32( v1, S0 );
    __n64 R0 = vrsqrts_f32( P0, S0 );
    __n64 S1 = vmul_f32( S0, R0 );
    __n64 P1 = vmul_f32( v1, S1 );
    __n64 R1 = vrsqrts_f32( P1, S1 );
    v2 = vmul_f32( S1, R1 );
    // Normalize
    __n128 vResult = vmulq_f32( V, vcombine_f32(v2,v2) );
    // Select zero for zero-length inputs, QNaN for infinite-length inputs
    vResult = vbslq_f32( vcombine_f32(VEqualsZero,VEqualsZero), vdupq_n_f32(0), vResult );
    return vbslq_f32( vcombine_f32(VEqualsInf,VEqualsInf), g_XMQNaN, vResult );
#elif defined(_XM_SSE_INTRINSICS_)
    // Perform the dot product on x,y,z and w
    XMVECTOR vLengthSq = _mm_mul_ps(V,V);
    // vTemp has z and w
    XMVECTOR vTemp = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(3,2,3,2));
    // x+z, y+w
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // x+z,x+z,x+z,y+w
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(1,0,0,0));
    // ??,??,y+w,y+w
    vTemp = _mm_shuffle_ps(vTemp,vLengthSq,_MM_SHUFFLE(3,3,0,0));
    // ??,??,x+z+y+w,??
    vLengthSq = _mm_add_ps(vLengthSq,vTemp);
    // Splat the length
    vLengthSq = XM_PERMUTE_PS(vLengthSq,_MM_SHUFFLE(2,2,2,2));
    // Prepare for the division
    XMVECTOR vResult = _mm_sqrt_ps(vLengthSq);
    // Create zero with a single instruction
    XMVECTOR vZeroMask = _mm_setzero_ps();
    // Test for a divide by zero (Must be FP to detect -0.0)
    vZeroMask = _mm_cmpneq_ps(vZeroMask,vResult);
    // Failsafe on zero (Or epsilon) length planes
    // If the length is infinity, set the elements to zero
    vLengthSq = _mm_cmpneq_ps(vLengthSq,g_XMInfinity);
    // Divide to perform the normalization
    vResult = _mm_div_ps(V,vResult);
    // Any that are infinity, set to zero
    vResult = _mm_and_ps(vResult,vZeroMask);
    // Select qnan or result based on infinite length
    XMVECTOR vTemp1 = _mm_andnot_ps(vLengthSq,g_XMQNaN);
    XMVECTOR vTemp2 = _mm_and_ps(vResult,vLengthSq);
    vResult = _mm_or_ps(vTemp1,vTemp2);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
10020
10021//------------------------------------------------------------------------------
10022
10023inline XMVECTOR XMVector4ClampLength
10024(
10025 FXMVECTOR V,
10026 float LengthMin,
10027 float LengthMax
10028)
10029{
10030 XMVECTOR ClampMax = XMVectorReplicate(LengthMax);
10031 XMVECTOR ClampMin = XMVectorReplicate(LengthMin);
10032
10033 return XMVector4ClampLengthV(V, ClampMin, ClampMax);
10034}
10035
10036//------------------------------------------------------------------------------
10037
// Clamps the length of a 4D vector to [LengthMin, LengthMax], where both
// bounds are vectors whose components must all be equal and non-negative
// (enforced by the asserts). Branchless: special cases (zero and infinite
// length) are folded in with mask-based selects.
inline XMVECTOR XMVector4ClampLengthV
(
    FXMVECTOR V,
    FXMVECTOR LengthMin,
    FXMVECTOR LengthMax
)
{
    assert((XMVectorGetY(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetZ(LengthMin) == XMVectorGetX(LengthMin)) && (XMVectorGetW(LengthMin) == XMVectorGetX(LengthMin)));
    assert((XMVectorGetY(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetZ(LengthMax) == XMVectorGetX(LengthMax)) && (XMVectorGetW(LengthMax) == XMVectorGetX(LengthMax)));
    assert(XMVector4GreaterOrEqual(LengthMin, XMVectorZero()));
    assert(XMVector4GreaterOrEqual(LengthMax, XMVectorZero()));
    assert(XMVector4GreaterOrEqual(LengthMax, LengthMin));

    XMVECTOR LengthSq = XMVector4LengthSq(V);

    const XMVECTOR Zero = XMVectorZero();

    XMVECTOR RcpLength = XMVectorReciprocalSqrt(LengthSq);

    // Masks for the degenerate cases
    XMVECTOR InfiniteLength = XMVectorEqualInt(LengthSq, g_XMInfinity.v);
    XMVECTOR ZeroLength = XMVectorEqual(LengthSq, Zero);

    XMVECTOR Normal = XMVectorMultiply(V, RcpLength);

    // Length = LengthSq * (1/sqrt(LengthSq)) = sqrt(LengthSq)
    XMVECTOR Length = XMVectorMultiply(LengthSq, RcpLength);

    // If the length is neither zero nor infinite, keep the computed
    // Length/Normal; otherwise fall back to LengthSq (propagates the
    // degenerate value through the math below).
    XMVECTOR Select = XMVectorEqualInt(InfiniteLength, ZeroLength);
    Length = XMVectorSelect(LengthSq, Length, Select);
    Normal = XMVectorSelect(LengthSq, Normal, Select);

    XMVECTOR ControlMax = XMVectorGreater(Length, LengthMax);
    XMVECTOR ControlMin = XMVectorLess(Length, LengthMin);

    XMVECTOR ClampLength = XMVectorSelect(Length, LengthMax, ControlMax);
    ClampLength = XMVectorSelect(ClampLength, LengthMin, ControlMin);

    XMVECTOR Result = XMVectorMultiply(Normal, ClampLength);

    // Preserve the original vector (with no precision loss) if the length falls within the given range
    XMVECTOR Control = XMVectorEqualInt(ControlMax, ControlMin);
    Result = XMVectorSelect(Result, V, Control);

    return Result;
}
10082
10083//------------------------------------------------------------------------------
10084
10085inline XMVECTOR XMVector4Reflect
10086(
10087 FXMVECTOR Incident,
10088 FXMVECTOR Normal
10089)
10090{
10091 // Result = Incident - (2 * dot(Incident, Normal)) * Normal
10092
10093 XMVECTOR Result = XMVector4Dot(Incident, Normal);
10094 Result = XMVectorAdd(Result, Result);
10095 Result = XMVectorNegativeMultiplySubtract(Result, Normal, Incident);
10096
10097 return Result;
10098}
10099
10100//------------------------------------------------------------------------------
10101
10102inline XMVECTOR XMVector4Refract
10103(
10104 FXMVECTOR Incident,
10105 FXMVECTOR Normal,
10106 float RefractionIndex
10107)
10108{
10109 XMVECTOR Index = XMVectorReplicate(RefractionIndex);
10110 return XMVector4RefractV(Incident, Normal, Index);
10111}
10112
10113//------------------------------------------------------------------------------
10114
// Refracts a 4D incident vector through a surface, with the index of
// refraction supplied as a (typically replicated) vector.
//   Result = RefractionIndex * Incident
//            - Normal * (RefractionIndex * dot(Incident, Normal)
//                        + sqrt(1 - RefractionIndex^2 * (1 - dot(Incident, Normal)^2)))
// Returns the zero vector on total internal reflection (discriminant <= 0
// in all components).
inline XMVECTOR XMVector4RefractV
(
    FXMVECTOR Incident,
    FXMVECTOR Normal,
    FXMVECTOR RefractionIndex
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR IDotN;
    XMVECTOR R;
    const XMVECTOR Zero = XMVectorZero();

    // Result = RefractionIndex * Incident - Normal * (RefractionIndex * dot(Incident, Normal) +
    // sqrt(1 - RefractionIndex * RefractionIndex * (1 - dot(Incident, Normal) * dot(Incident, Normal))))

    IDotN = XMVector4Dot(Incident, Normal);

    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    R = XMVectorNegativeMultiplySubtract(IDotN, IDotN, g_XMOne.v);
    R = XMVectorMultiply(R, RefractionIndex);
    R = XMVectorNegativeMultiplySubtract(R, RefractionIndex, g_XMOne.v);

    if (XMVector4LessOrEqual(R, Zero))
    {
        // Total internal reflection
        return Zero;
    }
    else
    {
        XMVECTOR Result;

        // R = RefractionIndex * IDotN + sqrt(R)
        R = XMVectorSqrt(R);
        R = XMVectorMultiplyAdd(RefractionIndex, IDotN, R);

        // Result = RefractionIndex * Incident - Normal * R
        Result = XMVectorMultiply(RefractionIndex, Incident);
        Result = XMVectorNegativeMultiplySubtract(Normal, R, Result);

        return Result;
    }

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    XMVECTOR IDotN = XMVector4Dot(Incident,Normal);

    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    __n128 R = vmlsq_f32( g_XMOne, IDotN, IDotN);
    R = vmulq_f32(R, RefractionIndex);
    R = vmlsq_f32(g_XMOne, R, RefractionIndex );

    // Zip the <= 0 comparison mask down to one 32-bit lane to branch on it
    __n128 vResult = vcleq_f32(R,g_XMZero);
    int8x8x2_t vTemp = vzip_u8(vget_low_u8(vResult), vget_high_u8(vResult));
    vTemp = vzip_u16(vTemp.val[0], vTemp.val[1]);
    if ( vget_lane_u32(vTemp.val[1], 1) == 0xFFFFFFFFU )
    {
        // Total internal reflection
        vResult = g_XMZero;
    }
    else
    {
        // Sqrt(R) via rsqrt estimate + 2 Newton-Raphson refinement steps
        __n128 S0 = vrsqrteq_f32(R);
        __n128 P0 = vmulq_f32( R, S0 );
        __n128 R0 = vrsqrtsq_f32( P0, S0 );
        __n128 S1 = vmulq_f32( S0, R0 );
        __n128 P1 = vmulq_f32( R, S1 );
        __n128 R1 = vrsqrtsq_f32( P1, S1 );
        __n128 S2 = vmulq_f32( S1, R1 );
        R = vmulq_f32( R, S2 );
        // R = RefractionIndex * IDotN + sqrt(R)
        R = vmlaq_f32( R, RefractionIndex, IDotN );
        // Result = RefractionIndex * Incident - Normal * R
        vResult = vmulq_f32(RefractionIndex, Incident);
        vResult = vmlsq_f32( vResult, R, Normal );
    }
    return vResult;
#elif defined(_XM_SSE_INTRINSICS_)
    XMVECTOR IDotN = XMVector4Dot(Incident,Normal);

    // R = 1.0f - RefractionIndex * RefractionIndex * (1.0f - IDotN * IDotN)
    XMVECTOR R = _mm_mul_ps(IDotN,IDotN);
    R = _mm_sub_ps(g_XMOne,R);
    R = _mm_mul_ps(R, RefractionIndex);
    R = _mm_mul_ps(R, RefractionIndex);
    R = _mm_sub_ps(g_XMOne,R);

    XMVECTOR vResult = _mm_cmple_ps(R,g_XMZero);
    if (_mm_movemask_ps(vResult)==0x0f)
    {
        // Total internal reflection
        vResult = g_XMZero;
    }
    else
    {
        // R = RefractionIndex * IDotN + sqrt(R)
        R = _mm_sqrt_ps(R);
        vResult = _mm_mul_ps(RefractionIndex, IDotN);
        R = _mm_add_ps(R,vResult);
        // Result = RefractionIndex * Incident - Normal * R
        vResult = _mm_mul_ps(RefractionIndex, Incident);
        R = _mm_mul_ps(R,Normal);
        vResult = _mm_sub_ps(vResult,R);
    }
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
10223
10224//------------------------------------------------------------------------------
10225
// Returns a vector orthogonal to V: { z, w, -x, -y }.
// (Its 4D dot product with V is x*z + y*w - z*x - w*y = 0.)
inline XMVECTOR XMVector4Orthogonal
(
    FXMVECTOR V
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR Result;
    Result.vector4_f32[0] = V.vector4_f32[2];
    Result.vector4_f32[1] = V.vector4_f32[3];
    Result.vector4_f32[2] = -V.vector4_f32[0];
    Result.vector4_f32[3] = -V.vector4_f32[1];
    return Result;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    static const XMVECTORF32 Negate = { 1.f, 1.f, -1.f, -1.f };

    // Swap the halves (zw | xy), then negate the new z,w lanes
    __n128 Result = vcombine_f32( vget_high_f32( V ), vget_low_f32( V ) );
    return vmulq_f32( Result, Negate );
#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FlipZW = {1.0f,1.0f,-1.0f,-1.0f};
    XMVECTOR vResult = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,0,3,2));
    vResult = _mm_mul_ps(vResult,FlipZW);
    return vResult;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
10253
10254//------------------------------------------------------------------------------
10255
10256inline XMVECTOR XMVector4AngleBetweenNormalsEst
10257(
10258 FXMVECTOR N1,
10259 FXMVECTOR N2
10260)
10261{
10262#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
10263
10264 XMVECTOR Result = XMVector4Dot(N1, N2);
10265 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
10266 Result = XMVectorACosEst(Result);
10267 return Result;
10268
10269#else // _XM_VMX128_INTRINSICS_
10270#endif // _XM_VMX128_INTRINSICS_
10271}
10272
10273//------------------------------------------------------------------------------
10274
10275inline XMVECTOR XMVector4AngleBetweenNormals
10276(
10277 FXMVECTOR N1,
10278 FXMVECTOR N2
10279)
10280{
10281#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
10282
10283 XMVECTOR Result = XMVector4Dot(N1, N2);
10284 Result = XMVectorClamp(Result, g_XMNegativeOne.v, g_XMOne.v);
10285 Result = XMVectorACos(Result);
10286 return Result;
10287
10288#else // _XM_VMX128_INTRINSICS_
10289#endif // _XM_VMX128_INTRINSICS_
10290}
10291
10292//------------------------------------------------------------------------------
10293
10294inline XMVECTOR XMVector4AngleBetweenVectors
10295(
10296 FXMVECTOR V1,
10297 FXMVECTOR V2
10298)
10299{
10300#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(_XM_ARM_NEON_INTRINSICS_)
10301
10302 XMVECTOR L1 = XMVector4ReciprocalLength(V1);
10303 XMVECTOR L2 = XMVector4ReciprocalLength(V2);
10304
10305 XMVECTOR Dot = XMVector4Dot(V1, V2);
10306
10307 L1 = XMVectorMultiply(L1, L2);
10308
10309 XMVECTOR CosAngle = XMVectorMultiply(Dot, L1);
10310 CosAngle = XMVectorClamp(CosAngle, g_XMNegativeOne.v, g_XMOne.v);
10311
10312 return XMVectorACos(CosAngle);
10313
10314#else // _XM_VMX128_INTRINSICS_
10315#endif // _XM_VMX128_INTRINSICS_
10316}
10317
10318//------------------------------------------------------------------------------
10319
// Transforms the 4D vector V by the matrix M (row-vector convention):
//   Result = x*M.r[0] + y*M.r[1] + z*M.r[2] + w*M.r[3]
inline XMVECTOR XMVector4Transform
(
    FXMVECTOR V,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)
    float fX = (M.m[0][0]*V.vector4_f32[0])+(M.m[1][0]*V.vector4_f32[1])+(M.m[2][0]*V.vector4_f32[2])+(M.m[3][0]*V.vector4_f32[3]);
    float fY = (M.m[0][1]*V.vector4_f32[0])+(M.m[1][1]*V.vector4_f32[1])+(M.m[2][1]*V.vector4_f32[2])+(M.m[3][1]*V.vector4_f32[3]);
    float fZ = (M.m[0][2]*V.vector4_f32[0])+(M.m[1][2]*V.vector4_f32[1])+(M.m[2][2]*V.vector4_f32[2])+(M.m[3][2]*V.vector4_f32[3]);
    float fW = (M.m[0][3]*V.vector4_f32[0])+(M.m[1][3]*V.vector4_f32[1])+(M.m[2][3]*V.vector4_f32[2])+(M.m[3][3]*V.vector4_f32[3]);
    XMVECTOR vResult = {
        fX,
        fY,
        fZ,
        fW
    };
    return vResult;

#elif defined(_XM_ARM_NEON_INTRINSICS_)
    // Splat each component of V and multiply-accumulate against each row
    __n64 VL = vget_low_f32( V );
    XMVECTOR vTemp1 = vdupq_lane_f32( VL, 0 ); // X
    XMVECTOR vTemp2 = vdupq_lane_f32( VL, 1 ); // Y
    XMVECTOR vResult = vmulq_f32( vTemp1, M.r[0] );
    vResult = vmlaq_f32( vResult, vTemp2, M.r[1] );
    __n64 VH = vget_high_f32( V );
    vTemp1 = vdupq_lane_f32( VH, 0 ); // Z
    vTemp2 = vdupq_lane_f32( VH, 1 ); // W
    vResult = vmlaq_f32( vResult, vTemp1, M.r[2] );
    return vmlaq_f32( vResult, vTemp2, M.r[3] );
#elif defined(_XM_SSE_INTRINSICS_)
    // Splat x,y,z and w
    XMVECTOR vTempX = XM_PERMUTE_PS(V,_MM_SHUFFLE(0,0,0,0));
    XMVECTOR vTempY = XM_PERMUTE_PS(V,_MM_SHUFFLE(1,1,1,1));
    XMVECTOR vTempZ = XM_PERMUTE_PS(V,_MM_SHUFFLE(2,2,2,2));
    XMVECTOR vTempW = XM_PERMUTE_PS(V,_MM_SHUFFLE(3,3,3,3));
    // Mul by the matrix
    vTempX = _mm_mul_ps(vTempX,M.r[0]);
    vTempY = _mm_mul_ps(vTempY,M.r[1]);
    vTempZ = _mm_mul_ps(vTempZ,M.r[2]);
    vTempW = _mm_mul_ps(vTempW,M.r[3]);
    // Add them all together
    vTempX = _mm_add_ps(vTempX,vTempY);
    vTempZ = _mm_add_ps(vTempZ,vTempW);
    vTempX = _mm_add_ps(vTempX,vTempZ);
    return vTempX;
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
10369
10370//------------------------------------------------------------------------------
// Transforms a stream of XMFLOAT4 vectors by the matrix M, writing the
// results to an output stream. Input/output strides are in bytes, so the
// vectors may be embedded in larger per-element structures. The streams
// may alias element-for-element (in-place transform); partially
// overlapping streams are presumably not supported — TODO confirm.
// Returns pOutputStream.
_Use_decl_annotations_
inline XMFLOAT4* XMVector4TransformStream
(
    XMFLOAT4* pOutputStream,
    size_t OutputStride,
    const XMFLOAT4* pInputStream,
    size_t InputStride,
    size_t VectorCount,
    CXMMATRIX M
)
{
    assert(pOutputStream != NULL);
    assert(pInputStream != NULL);

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_ARM_NEON_INTRINSICS_)

    // Byte pointers so the strides can be applied directly
    const uint8_t* pInputVector = (const uint8_t*)pInputStream;
    uint8_t* pOutputVector = (uint8_t*)pOutputStream;

    // Hoist the matrix rows out of the loop
    const XMVECTOR row0 = M.r[0];
    const XMVECTOR row1 = M.r[1];
    const XMVECTOR row2 = M.r[2];
    const XMVECTOR row3 = M.r[3];

    for (size_t i = 0; i < VectorCount; i++)
    {
        XMVECTOR V = XMLoadFloat4((const XMFLOAT4*)pInputVector);
        XMVECTOR W = XMVectorSplatW(V);
        XMVECTOR Z = XMVectorSplatZ(V);
        XMVECTOR Y = XMVectorSplatY(V);
        XMVECTOR X = XMVectorSplatX(V);

        // Result = x*row0 + y*row1 + z*row2 + w*row3
        XMVECTOR Result = XMVectorMultiply(W, row3);
        Result = XMVectorMultiplyAdd(Z, row2, Result);
        Result = XMVectorMultiplyAdd(Y, row1, Result);
        Result = XMVectorMultiplyAdd(X, row0, Result);

        XMStoreFloat4((XMFLOAT4*)pOutputVector, Result);

        pInputVector += InputStride;
        pOutputVector += OutputStride;
    }

    return pOutputStream;

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
10419
10420/****************************************************************************
10421 *
10422 * XMVECTOR operators
10423 *
10424 ****************************************************************************/
10425
10426//------------------------------------------------------------------------------
10427
// Unary plus operator; returns the vector unchanged.
inline XMVECTOR operator+ (FXMVECTOR V)
{
    return V;
}
10432
10433//------------------------------------------------------------------------------
10434
// Unary minus operator; negates each component of the vector.
inline XMVECTOR operator- (FXMVECTOR V)
{
    return XMVectorNegate(V);
}
10439
10440//------------------------------------------------------------------------------
10441
10442inline XMVECTOR& operator+=
10443(
10444 XMVECTOR& V1,
10445 FXMVECTOR V2
10446)
10447{
10448 V1 = XMVectorAdd(V1, V2);
10449 return V1;
10450}
10451
10452//------------------------------------------------------------------------------
10453
10454inline XMVECTOR& operator-=
10455(
10456 XMVECTOR& V1,
10457 FXMVECTOR V2
10458)
10459{
10460 V1 = XMVectorSubtract(V1, V2);
10461 return V1;
10462}
10463
10464//------------------------------------------------------------------------------
10465
10466inline XMVECTOR& operator*=
10467(
10468 XMVECTOR& V1,
10469 FXMVECTOR V2
10470)
10471{
10472 V1 = XMVectorMultiply(V1, V2);
10473 return V1;
10474}
10475
10476//------------------------------------------------------------------------------
10477
10478inline XMVECTOR& operator/=
10479(
10480 XMVECTOR& V1,
10481 FXMVECTOR V2
10482)
10483{
10484 V1 = XMVectorDivide(V1,V2);
10485 return V1;
10486}
10487
10488//------------------------------------------------------------------------------
10489
10490inline XMVECTOR& operator*=
10491(
10492 XMVECTOR& V,
10493 const float S
10494)
10495{
10496 V = XMVectorScale(V, S);
10497 return V;
10498}
10499
10500//------------------------------------------------------------------------------
10501
10502inline XMVECTOR& operator/=
10503(
10504 XMVECTOR& V,
10505 const float S
10506)
10507{
10508 assert( S != 0.0f );
10509 V = XMVectorScale(V, 1.0f / S);
10510 return V;
10511}
10512
10513//------------------------------------------------------------------------------
10514
10515inline XMVECTOR operator+
10516(
10517 FXMVECTOR V1,
10518 FXMVECTOR V2
10519)
10520{
10521 return XMVectorAdd(V1, V2);
10522}
10523
10524//------------------------------------------------------------------------------
10525
10526inline XMVECTOR operator-
10527(
10528 FXMVECTOR V1,
10529 FXMVECTOR V2
10530)
10531{
10532 return XMVectorSubtract(V1, V2);
10533}
10534
10535//------------------------------------------------------------------------------
10536
10537inline XMVECTOR operator*
10538(
10539 FXMVECTOR V1,
10540 FXMVECTOR V2
10541)
10542{
10543 return XMVectorMultiply(V1, V2);
10544}
10545
10546//------------------------------------------------------------------------------
10547
10548inline XMVECTOR operator/
10549(
10550 FXMVECTOR V1,
10551 FXMVECTOR V2
10552)
10553{
10554 return XMVectorDivide(V1,V2);
10555}
10556
10557//------------------------------------------------------------------------------
10558
10559inline XMVECTOR operator*
10560(
10561 FXMVECTOR V,
10562 const float S
10563)
10564{
10565 return XMVectorScale(V, S);
10566}
10567
10568//------------------------------------------------------------------------------
10569
10570inline XMVECTOR operator/
10571(
10572 FXMVECTOR V,
10573 const float S
10574)
10575{
10576 assert( S != 0.0f );
10577 return XMVectorScale(V, 1.0f / S);
10578}
10579
10580//------------------------------------------------------------------------------
10581
10582inline XMVECTOR operator*
10583(
10584 float S,
10585 FXMVECTOR V
10586)
10587{
10588 return XMVectorScale(V, S);
10589}
10590
10591#if defined(_XM_NO_INTRINSICS_)
10592#undef XMISNAN
10593#undef XMISINF
10594#endif
10595
10596