summaryrefslogtreecommitdiff
path: root/thirdparty/bullet/Bullet3Common/b3Vector3.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/bullet/Bullet3Common/b3Vector3.cpp')
-rw-r--r--thirdparty/bullet/Bullet3Common/b3Vector3.cpp2986
1 files changed, 1496 insertions, 1490 deletions
diff --git a/thirdparty/bullet/Bullet3Common/b3Vector3.cpp b/thirdparty/bullet/Bullet3Common/b3Vector3.cpp
index 5f5ac4ac04..100fb774c1 100644
--- a/thirdparty/bullet/Bullet3Common/b3Vector3.cpp
+++ b/thirdparty/bullet/Bullet3Common/b3Vector3.cpp
@@ -14,274 +14,281 @@
This source version has been altered.
*/
-#if defined (_WIN32) || defined (__i386__)
+#if defined(_WIN32) || defined(__i386__)
#define B3_USE_SSE_IN_API
#endif
#include "b3Vector3.h"
-#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
+#if defined(B3_USE_SSE) || defined(B3_USE_NEON)
#ifdef __APPLE__
#include <stdint.h>
-typedef float float4 __attribute__ ((vector_size(16)));
+typedef float float4 __attribute__((vector_size(16)));
#else
#define float4 __m128
#endif
//typedef uint32_t uint4 __attribute__ ((vector_size(16)));
-
#if defined B3_USE_SSE || defined _WIN32
-#define LOG2_ARRAY_SIZE 6
-#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
+#define LOG2_ARRAY_SIZE 6
+#define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
#include <emmintrin.h>
-long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
-long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
+long b3_maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
- const float4 *vertices = (const float4*) vv;
- static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
- float4 dotMax = b3Assign128( -B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY );
- float4 vvec = _mm_loadu_ps( vec );
- float4 vHi = b3CastiTo128f(_mm_shuffle_epi32( b3CastfTo128i( vvec), 0xaa )); /// zzzz
- float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy
+ const float4 *vertices = (const float4 *)vv;
+ static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
+ float4 dotMax = b3Assign128(-B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY);
+ float4 vvec = _mm_loadu_ps(vec);
+ float4 vHi = b3CastiTo128f(_mm_shuffle_epi32(b3CastfTo128i(vvec), 0xaa)); /// zzzz
+ float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy
- long maxIndex = -1L;
+ long maxIndex = -1L;
- size_t segment = 0;
- float4 stack_array[ STACK_ARRAY_COUNT ];
+ size_t segment = 0;
+ float4 stack_array[STACK_ARRAY_COUNT];
#if DEBUG
- // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+ // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
#endif
- size_t index;
- float4 max;
- // Faster loop without cleanup code for full tiles
- for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
- {
- max = dotMax;
-
- for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3]; vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+1] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+2] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+3] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
- }
-
- // If we found a new max
- if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
- {
- // copy the new max across all lanes of our max accumulator
- max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
- max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
-
- dotMax = max;
-
- // find first occurrence of that max
- size_t test;
- for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
- {}
- // record where it is.
- maxIndex = 4*index + segment + indexTable[test];
- }
- }
-
- // account for work we've already done
- count -= segment;
-
- // Deal with the last < STACK_ARRAY_COUNT vectors
- max = dotMax;
- index = 0;
-
-
- if( b3Unlikely( count > 16) )
- {
- for( ; index + 4 <= count / 4; index+=4 )
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3]; vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+1] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+2] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+3] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
- }
- }
-
- size_t localCount = (count & -4L) - 4*index;
- if( localCount )
- {
+ size_t index;
+ float4 max;
+ // Faster loop without cleanup code for full tiles
+ for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
+ {
+ max = dotMax;
+
+ for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
+ { // do four dot products at a time. Carefully avoid touching the w element.
+ float4 v0 = vertices[0];
+ float4 v1 = vertices[1];
+ float4 v2 = vertices[2];
+ float4 v3 = vertices[3];
+ vertices += 4;
+
+ float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index] = x;
+ max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 1] = x;
+ max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 2] = x;
+ max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 3] = x;
+ max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
+
+ // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
+ }
+
+ // If we found a new max
+ if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
+ {
+ // copy the new max across all lanes of our max accumulator
+ max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
+ max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
+
+ dotMax = max;
+
+ // find first occurrence of that max
+ size_t test;
+ for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
+ {
+ }
+ // record where it is.
+ maxIndex = 4 * index + segment + indexTable[test];
+ }
+ }
+
+ // account for work we've already done
+ count -= segment;
+
+ // Deal with the last < STACK_ARRAY_COUNT vectors
+ max = dotMax;
+ index = 0;
+
+ if (b3Unlikely(count > 16))
+ {
+ for (; index + 4 <= count / 4; index += 4)
+ { // do four dot products at a time. Carefully avoid touching the w element.
+ float4 v0 = vertices[0];
+ float4 v1 = vertices[1];
+ float4 v2 = vertices[2];
+ float4 v3 = vertices[3];
+ vertices += 4;
+
+ float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index] = x;
+ max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 1] = x;
+ max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 2] = x;
+ max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 3] = x;
+ max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
+
+ // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
+ }
+ }
+
+ size_t localCount = (count & -4L) - 4 * index;
+ if (localCount)
+ {
#ifdef __APPLE__
- float4 t0, t1, t2, t3, t4;
- float4 * sap = &stack_array[index + localCount / 4];
- vertices += localCount; // counter the offset
- size_t byteIndex = -(localCount) * sizeof(float);
- //AT&T Code style assembly
- asm volatile
- ( ".align 4 \n\
+ float4 t0, t1, t2, t3, t4;
+ float4 *sap = &stack_array[index + localCount / 4];
+ vertices += localCount; // counter the offset
+ size_t byteIndex = -(localCount) * sizeof(float);
+ //AT&T Code style assembly
+ asm volatile(
+ ".align 4 \n\
0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
@@ -307,369 +314,375 @@ long b3_maxdot_large( const float *vv, const float *vec, unsigned long count, fl
add $16, %[byteIndex] // advance loop counter\n\
jnz 0b \n\
"
- : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
- : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
- : "memory", "cc"
- );
- index += localCount/4;
+ : [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
+ : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
+ : "memory", "cc");
+ index += localCount / 4;
#else
- {
- for( unsigned int i=0; i<localCount/4; i++,index++)
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3];
- vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
- }
- }
-#endif //__APPLE__
- }
-
- // process the last few points
- if( count & 3 )
- {
- float4 v0, v1, v2, x, y, z;
- switch( count & 3 )
- {
- case 3:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
-
- // Calculate 3 dot products, transpose, duplicate v2
- float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
- float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
- lo0 = lo0*vLo;
- z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
- z = z*vHi;
- float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
- lo1 = lo1*vLo;
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- }
- break;
- case 2:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- float4 xy = _mm_movelh_ps(v0, v1);
- z = _mm_movehl_ps(v1, v0);
- xy = xy*vLo;
- z = _mm_shuffle_ps( z, z, 0xa8);
- x = _mm_shuffle_ps( xy, xy, 0xa8);
- y = _mm_shuffle_ps( xy, xy, 0xfd);
- z = z*vHi;
- }
- break;
- case 1:
- {
- float4 xy = vertices[0];
- z = _mm_shuffle_ps( xy, xy, 0xaa);
- xy = xy*vLo;
- z = z*vHi;
- x = _mm_shuffle_ps(xy, xy, 0);
- y = _mm_shuffle_ps(xy, xy, 0x55);
- }
- break;
- }
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
- index++;
- }
-
- // if we found a new max.
- if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
- { // we found a new max. Search for it
- // find max across the max vector, place in all elements of max -- big latency hit here
- max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
- max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
-
- // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
- // this where it actually makes a difference is handled in the early out at the top of the function,
- // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
- // complexity, and removed it.
-
- dotMax = max;
-
- // scan for the first occurence of max in the array
- size_t test;
- for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
- {}
- maxIndex = 4*index + segment + indexTable[test];
- }
-
- _mm_store_ss( dotResult, dotMax);
- return maxIndex;
+ {
+ for (unsigned int i = 0; i < localCount / 4; i++, index++)
+ { // do four dot products at a time. Carefully avoid touching the w element.
+ float4 v0 = vertices[0];
+ float4 v1 = vertices[1];
+ float4 v2 = vertices[2];
+ float4 v3 = vertices[3];
+ vertices += 4;
+
+ float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index] = x;
+ max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
+ }
+ }
+#endif //__APPLE__
+ }
+
+ // process the last few points
+ if (count & 3)
+ {
+ float4 v0, v1, v2, x, y, z;
+ switch (count & 3)
+ {
+ case 3:
+ {
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+
+ // Calculate 3 dot products, transpose, duplicate v2
+ float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
+ float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
+ lo0 = lo0 * vLo;
+ z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
+ z = z * vHi;
+ float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
+ lo1 = lo1 * vLo;
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ }
+ break;
+ case 2:
+ {
+ v0 = vertices[0];
+ v1 = vertices[1];
+ float4 xy = _mm_movelh_ps(v0, v1);
+ z = _mm_movehl_ps(v1, v0);
+ xy = xy * vLo;
+ z = _mm_shuffle_ps(z, z, 0xa8);
+ x = _mm_shuffle_ps(xy, xy, 0xa8);
+ y = _mm_shuffle_ps(xy, xy, 0xfd);
+ z = z * vHi;
+ }
+ break;
+ case 1:
+ {
+ float4 xy = vertices[0];
+ z = _mm_shuffle_ps(xy, xy, 0xaa);
+ xy = xy * vLo;
+ z = z * vHi;
+ x = _mm_shuffle_ps(xy, xy, 0);
+ y = _mm_shuffle_ps(xy, xy, 0x55);
+ }
+ break;
+ }
+ x = x + y;
+ x = x + z;
+ stack_array[index] = x;
+ max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
+ index++;
+ }
+
+ // if we found a new max.
+ if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
+ { // we found a new max. Search for it
+ // find max across the max vector, place in all elements of max -- big latency hit here
+ max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
+ max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
+
+ // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+ // this where it actually makes a difference is handled in the early out at the top of the function,
+ // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
+ // complexity, and removed it.
+
+ dotMax = max;
+
+ // scan for the first occurence of max in the array
+ size_t test;
+ for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
+ {
+ }
+ maxIndex = 4 * index + segment + indexTable[test];
+ }
+
+ _mm_store_ss(dotResult, dotMax);
+ return maxIndex;
}
-long b3_mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
+long b3_mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
-long b3_mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
- const float4 *vertices = (const float4*) vv;
- static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
+ const float4 *vertices = (const float4 *)vv;
+ static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
- float4 dotmin = b3Assign128( B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY );
- float4 vvec = _mm_loadu_ps( vec );
- float4 vHi = b3CastiTo128f(_mm_shuffle_epi32( b3CastfTo128i( vvec), 0xaa )); /// zzzz
- float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy
+ float4 dotmin = b3Assign128(B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY);
+ float4 vvec = _mm_loadu_ps(vec);
+ float4 vHi = b3CastiTo128f(_mm_shuffle_epi32(b3CastfTo128i(vvec), 0xaa)); /// zzzz
+ float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy
- long minIndex = -1L;
+ long minIndex = -1L;
- size_t segment = 0;
- float4 stack_array[ STACK_ARRAY_COUNT ];
+ size_t segment = 0;
+ float4 stack_array[STACK_ARRAY_COUNT];
#if DEBUG
- // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
+ // memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
#endif
- size_t index;
- float4 min;
- // Faster loop without cleanup code for full tiles
- for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
- {
- min = dotmin;
-
- for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3]; vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+1] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+2] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+3] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
- }
-
- // If we found a new min
- if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
- {
- // copy the new min across all lanes of our min accumulator
- min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
- min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
-
- dotmin = min;
-
- // find first occurrence of that min
- size_t test;
- for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
- {}
- // record where it is.
- minIndex = 4*index + segment + indexTable[test];
- }
- }
-
- // account for work we've already done
- count -= segment;
-
- // Deal with the last < STACK_ARRAY_COUNT vectors
- min = dotmin;
- index = 0;
-
-
- if(b3Unlikely( count > 16) )
- {
- for( ; index + 4 <= count / 4; index+=4 )
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3]; vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+1] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+2] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+3] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
- }
- }
-
- size_t localCount = (count & -4L) - 4*index;
- if( localCount )
- {
-
-
+ size_t index;
+ float4 min;
+ // Faster loop without cleanup code for full tiles
+ for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
+ {
+ min = dotmin;
+
+ for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
+ { // do four dot products at a time. Carefully avoid touching the w element.
+ float4 v0 = vertices[0];
+ float4 v1 = vertices[1];
+ float4 v2 = vertices[2];
+ float4 v3 = vertices[3];
+ vertices += 4;
+
+ float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index] = x;
+ min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 1] = x;
+ min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 2] = x;
+ min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 3] = x;
+ min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
+
+ // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
+ }
+
+ // If we found a new min
+ if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
+ {
+ // copy the new min across all lanes of our min accumulator
+ min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
+ min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
+
+ dotmin = min;
+
+ // find first occurrence of that min
+ size_t test;
+ for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
+ {
+ }
+ // record where it is.
+ minIndex = 4 * index + segment + indexTable[test];
+ }
+ }
+
+ // account for work we've already done
+ count -= segment;
+
+ // Deal with the last < STACK_ARRAY_COUNT vectors
+ min = dotmin;
+ index = 0;
+
+ if (b3Unlikely(count > 16))
+ {
+ for (; index + 4 <= count / 4; index += 4)
+ { // do four dot products at a time. Carefully avoid touching the w element.
+ float4 v0 = vertices[0];
+ float4 v1 = vertices[1];
+ float4 v2 = vertices[2];
+ float4 v3 = vertices[3];
+ vertices += 4;
+
+ float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index] = x;
+ min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 1] = x;
+ min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 2] = x;
+ min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
+
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+ v3 = vertices[3];
+ vertices += 4;
+
+ lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index + 3] = x;
+ min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
+
+ // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
+ }
+ }
+
+ size_t localCount = (count & -4L) - 4 * index;
+ if (localCount)
+ {
#ifdef __APPLE__
- vertices += localCount; // counter the offset
- float4 t0, t1, t2, t3, t4;
- size_t byteIndex = -(localCount) * sizeof(float);
- float4 * sap = &stack_array[index + localCount / 4];
+ vertices += localCount; // counter the offset
+ float4 t0, t1, t2, t3, t4;
+ size_t byteIndex = -(localCount) * sizeof(float);
+ float4 *sap = &stack_array[index + localCount / 4];
- asm volatile
- ( ".align 4 \n\
+ asm volatile(
+ ".align 4 \n\
0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
@@ -695,937 +708,930 @@ long b3_mindot_large( const float *vv, const float *vec, unsigned long count, fl
add $16, %[byteIndex] // advance loop counter\n\
jnz 0b \n\
"
- : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
- : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
- : "memory", "cc"
- );
- index += localCount/4;
+ : [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
+ : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
+ : "memory", "cc");
+ index += localCount / 4;
#else
- {
- for( unsigned int i=0; i<localCount/4; i++,index++)
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3];
- vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- min = _mm_min_ps( x, min ); // control the order here so that max is never NaN even if x is nan
- }
- }
+ {
+ for (unsigned int i = 0; i < localCount / 4; i++, index++)
+ { // do four dot products at a time. Carefully avoid touching the w element.
+ float4 v0 = vertices[0];
+ float4 v1 = vertices[1];
+ float4 v2 = vertices[2];
+ float4 v3 = vertices[3];
+ vertices += 4;
+
+ float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
+ float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
+ float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
+ float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
+
+ lo0 = lo0 * vLo;
+ lo1 = lo1 * vLo;
+ float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
+ float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ z = z * vHi;
+ x = x + y;
+ x = x + z;
+ stack_array[index] = x;
+ min = _mm_min_ps(x, min); // control the order here so that max is never NaN even if x is nan
+ }
+ }
#endif
- }
-
- // process the last few points
- if( count & 3 )
- {
- float4 v0, v1, v2, x, y, z;
- switch( count & 3 )
- {
- case 3:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
-
- // Calculate 3 dot products, transpose, duplicate v2
- float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
- float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
- lo0 = lo0*vLo;
- z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
- z = z*vHi;
- float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
- lo1 = lo1*vLo;
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- }
- break;
- case 2:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- float4 xy = _mm_movelh_ps(v0, v1);
- z = _mm_movehl_ps(v1, v0);
- xy = xy*vLo;
- z = _mm_shuffle_ps( z, z, 0xa8);
- x = _mm_shuffle_ps( xy, xy, 0xa8);
- y = _mm_shuffle_ps( xy, xy, 0xfd);
- z = z*vHi;
- }
- break;
- case 1:
- {
- float4 xy = vertices[0];
- z = _mm_shuffle_ps( xy, xy, 0xaa);
- xy = xy*vLo;
- z = z*vHi;
- x = _mm_shuffle_ps(xy, xy, 0);
- y = _mm_shuffle_ps(xy, xy, 0x55);
- }
- break;
- }
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
- index++;
- }
-
- // if we found a new min.
- if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
- { // we found a new min. Search for it
- // find min across the min vector, place in all elements of min -- big latency hit here
- min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
- min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
-
- // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
- // this where it actually makes a difference is handled in the early out at the top of the function,
- // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
- // complexity, and removed it.
-
- dotmin = min;
-
- // scan for the first occurence of min in the array
- size_t test;
- for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
- {}
- minIndex = 4*index + segment + indexTable[test];
- }
-
- _mm_store_ss( dotResult, dotmin);
- return minIndex;
+ }
+
+ // process the last few points
+ if (count & 3)
+ {
+ float4 v0, v1, v2, x, y, z;
+ switch (count & 3)
+ {
+ case 3:
+ {
+ v0 = vertices[0];
+ v1 = vertices[1];
+ v2 = vertices[2];
+
+ // Calculate 3 dot products, transpose, duplicate v2
+ float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
+ float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
+ lo0 = lo0 * vLo;
+ z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
+ z = z * vHi;
+ float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
+ lo1 = lo1 * vLo;
+ x = _mm_shuffle_ps(lo0, lo1, 0x88);
+ y = _mm_shuffle_ps(lo0, lo1, 0xdd);
+ }
+ break;
+ case 2:
+ {
+ v0 = vertices[0];
+ v1 = vertices[1];
+ float4 xy = _mm_movelh_ps(v0, v1);
+ z = _mm_movehl_ps(v1, v0);
+ xy = xy * vLo;
+ z = _mm_shuffle_ps(z, z, 0xa8);
+ x = _mm_shuffle_ps(xy, xy, 0xa8);
+ y = _mm_shuffle_ps(xy, xy, 0xfd);
+ z = z * vHi;
+ }
+ break;
+ case 1:
+ {
+ float4 xy = vertices[0];
+ z = _mm_shuffle_ps(xy, xy, 0xaa);
+ xy = xy * vLo;
+ z = z * vHi;
+ x = _mm_shuffle_ps(xy, xy, 0);
+ y = _mm_shuffle_ps(xy, xy, 0x55);
+ }
+ break;
+ }
+ x = x + y;
+ x = x + z;
+ stack_array[index] = x;
+ min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
+ index++;
+ }
+
+ // if we found a new min.
+ if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
+ { // we found a new min. Search for it
+ // find min across the min vector, place in all elements of min -- big latency hit here
+ min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
+ min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
+
+ // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
+ // this where it actually makes a difference is handled in the early out at the top of the function,
+ // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
+ // complexity, and removed it.
+
+ dotmin = min;
+
+ // scan for the first occurence of min in the array
+ size_t test;
+ for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
+ {
+ }
+ minIndex = 4 * index + segment + indexTable[test];
+ }
+
+ _mm_store_ss(dotResult, dotmin);
+ return minIndex;
}
-
#elif defined B3_USE_NEON
-#define ARM_NEON_GCC_COMPATIBILITY 1
+#define ARM_NEON_GCC_COMPATIBILITY 1
#include <arm_neon.h>
+static long b3_maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
+static long b3_maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
+static long b3_maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
+static long b3_mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
+static long b3_mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
+static long b3_mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
-static long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
-static long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
-static long b3_maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
-static long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
-static long b3_mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
-static long b3_mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
+long (*b3_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = b3_maxdot_large_sel;
+long (*b3_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = b3_mindot_large_sel;
-long (*b3_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = b3_maxdot_large_sel;
-long (*b3_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = b3_mindot_large_sel;
-
-extern "C" {int _get_cpu_capabilities( void );}
-
-static long b3_maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
+extern "C"
{
- if( _get_cpu_capabilities() & 0x2000 )
- b3_maxdot_large = _maxdot_large_v1;
- else
- b3_maxdot_large = _maxdot_large_v0;
-
- return b3_maxdot_large(vv, vec, count, dotResult);
+ int _get_cpu_capabilities(void);
}
-static long b3_mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
+static long b3_maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
- if( _get_cpu_capabilities() & 0x2000 )
- b3_mindot_large = _mindot_large_v1;
- else
- b3_mindot_large = _mindot_large_v0;
+ if (_get_cpu_capabilities() & 0x2000)
+ b3_maxdot_large = _maxdot_large_v1;
+ else
+ b3_maxdot_large = _maxdot_large_v0;
- return b3_mindot_large(vv, vec, count, dotResult);
+ return b3_maxdot_large(vv, vec, count, dotResult);
}
+static long b3_mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
+{
+ if (_get_cpu_capabilities() & 0x2000)
+ b3_mindot_large = _mindot_large_v1;
+ else
+ b3_mindot_large = _mindot_large_v0;
+ return b3_mindot_large(vv, vec, count, dotResult);
+}
-#define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
-
+#define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
-long b3_maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
- unsigned long i = 0;
- float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
- float32x2_t vLo = vget_low_f32(vvec);
- float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
- float32x2_t dotMaxLo = (float32x2_t) { -B3_INFINITY, -B3_INFINITY };
- float32x2_t dotMaxHi = (float32x2_t) { -B3_INFINITY, -B3_INFINITY };
- uint32x2_t indexLo = (uint32x2_t) {0, 1};
- uint32x2_t indexHi = (uint32x2_t) {2, 3};
- uint32x2_t iLo = (uint32x2_t) {-1, -1};
- uint32x2_t iHi = (uint32x2_t) {-1, -1};
- const uint32x2_t four = (uint32x2_t) {4,4};
-
- for( ; i+8 <= count; i+= 8 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
- uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
-
- v0 = vld1q_f32_aligned_postincrement( vv );
- v1 = vld1q_f32_aligned_postincrement( vv );
- v2 = vld1q_f32_aligned_postincrement( vv );
- v3 = vld1q_f32_aligned_postincrement( vv );
-
- xy0 = vmul_f32( vget_low_f32(v0), vLo);
- xy1 = vmul_f32( vget_low_f32(v1), vLo);
- xy2 = vmul_f32( vget_low_f32(v2), vLo);
- xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- zLo = vmul_f32( z0.val[0], vHi);
- zHi = vmul_f32( z1.val[0], vHi);
-
- rLo = vpadd_f32( xy0, xy1);
- rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- maskLo = vcgt_f32( rLo, dotMaxLo );
- maskHi = vcgt_f32( rHi, dotMaxHi );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
-
- for( ; i+4 <= count; i+= 4 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
- uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
-
- switch( count & 3 )
- {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy2);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
- uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- }
- break;
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- rLo = vadd_f32(rLo, zLo);
-
- uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
- float32x2_t zLo = vmul_f32( z0, vHi);
- float32x2_t rLo = vpadd_f32( xy0, xy0);
- rLo = vadd_f32(rLo, zLo);
- uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
-
- default:
- break;
- }
-
- // select best answer between hi and lo results
- uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
- dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
- iLo = vbsl_u32(mask, iHi, iLo);
-
- // select best answer between even and odd results
- dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
- iHi = vdup_lane_u32(iLo, 1);
- mask = vcgt_f32( dotMaxHi, dotMaxLo );
- dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
- iLo = vbsl_u32(mask, iHi, iLo);
-
- *dotResult = vget_lane_f32( dotMaxLo, 0);
- return vget_lane_u32(iLo, 0);
+ unsigned long i = 0;
+ float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
+ float32x2_t vLo = vget_low_f32(vvec);
+ float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
+ float32x2_t dotMaxLo = (float32x2_t){-B3_INFINITY, -B3_INFINITY};
+ float32x2_t dotMaxHi = (float32x2_t){-B3_INFINITY, -B3_INFINITY};
+ uint32x2_t indexLo = (uint32x2_t){0, 1};
+ uint32x2_t indexHi = (uint32x2_t){2, 3};
+ uint32x2_t iLo = (uint32x2_t){-1, -1};
+ uint32x2_t iHi = (uint32x2_t){-1, -1};
+ const uint32x2_t four = (uint32x2_t){4, 4};
+
+ for (; i + 8 <= count; i += 8)
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+ float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+ float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+ float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+ float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+ float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+ float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+
+ float32x2_t rLo = vpadd_f32(xy0, xy1);
+ float32x2_t rHi = vpadd_f32(xy2, xy3);
+ rLo = vadd_f32(rLo, zLo);
+ rHi = vadd_f32(rHi, zHi);
+
+ uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+ uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
+ dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+ dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ iHi = vbsl_u32(maskHi, indexHi, iHi);
+ indexLo = vadd_u32(indexLo, four);
+ indexHi = vadd_u32(indexHi, four);
+
+ v0 = vld1q_f32_aligned_postincrement(vv);
+ v1 = vld1q_f32_aligned_postincrement(vv);
+ v2 = vld1q_f32_aligned_postincrement(vv);
+ v3 = vld1q_f32_aligned_postincrement(vv);
+
+ xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ xy1 = vmul_f32(vget_low_f32(v1), vLo);
+ xy2 = vmul_f32(vget_low_f32(v2), vLo);
+ xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+ z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+ z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+ zLo = vmul_f32(z0.val[0], vHi);
+ zHi = vmul_f32(z1.val[0], vHi);
+
+ rLo = vpadd_f32(xy0, xy1);
+ rHi = vpadd_f32(xy2, xy3);
+ rLo = vadd_f32(rLo, zLo);
+ rHi = vadd_f32(rHi, zHi);
+
+ maskLo = vcgt_f32(rLo, dotMaxLo);
+ maskHi = vcgt_f32(rHi, dotMaxHi);
+ dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+ dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ iHi = vbsl_u32(maskHi, indexHi, iHi);
+ indexLo = vadd_u32(indexLo, four);
+ indexHi = vadd_u32(indexHi, four);
+ }
+
+ for (; i + 4 <= count; i += 4)
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+ float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+ float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+ float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+ float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+ float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+ float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+
+ float32x2_t rLo = vpadd_f32(xy0, xy1);
+ float32x2_t rHi = vpadd_f32(xy2, xy3);
+ rLo = vadd_f32(rLo, zLo);
+ rHi = vadd_f32(rHi, zHi);
+
+ uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+ uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
+ dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+ dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ iHi = vbsl_u32(maskHi, indexHi, iHi);
+ indexLo = vadd_u32(indexLo, four);
+ indexHi = vadd_u32(indexHi, four);
+ }
+
+ switch (count & 3)
+ {
+ case 3:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+
+ float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+ float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+
+ float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+ float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
+
+ float32x2_t rLo = vpadd_f32(xy0, xy1);
+ float32x2_t rHi = vpadd_f32(xy2, xy2);
+ rLo = vadd_f32(rLo, zLo);
+ rHi = vadd_f32(rHi, zHi);
+
+ uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+ uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
+ dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+ dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ iHi = vbsl_u32(maskHi, indexHi, iHi);
+ }
+ break;
+ case 2:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+
+ float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+
+ float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+
+ float32x2_t rLo = vpadd_f32(xy0, xy1);
+ rLo = vadd_f32(rLo, zLo);
+
+ uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+ dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ }
+ break;
+ case 1:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+ float32x2_t zLo = vmul_f32(z0, vHi);
+ float32x2_t rLo = vpadd_f32(xy0, xy0);
+ rLo = vadd_f32(rLo, zLo);
+ uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
+ dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ // select best answer between hi and lo results
+ uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
+ dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+ iLo = vbsl_u32(mask, iHi, iLo);
+
+ // select best answer between even and odd results
+ dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
+ iHi = vdup_lane_u32(iLo, 1);
+ mask = vcgt_f32(dotMaxHi, dotMaxLo);
+ dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
+ iLo = vbsl_u32(mask, iHi, iLo);
+
+ *dotResult = vget_lane_f32(dotMaxLo, 0);
+ return vget_lane_u32(iLo, 0);
}
-
-long b3_maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
- float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
- float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
- float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
- const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
- uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
- uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
- float32x4_t maxDot = (float32x4_t) { -B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY };
-
- unsigned long i = 0;
- for( ; i + 8 <= count; i += 8 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
-
- v0 = vld1q_f32_aligned_postincrement( vv );
- v1 = vld1q_f32_aligned_postincrement( vv );
- v2 = vld1q_f32_aligned_postincrement( vv );
- v3 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- zb = vuzpq_f32( z0, z1);
- z = vmulq_f32( zb.val[0], vHi);
- xy = vuzpq_f32( xy0, xy1);
- x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
-
- for( ; i + 4 <= count; i += 4 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
-
- switch (count & 3) {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
-
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-
- xy0 = vmulq_f32(xy0, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z0);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
-
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
-
- xy0 = vmulq_f32(xy0, vLo);
-
- z = vmulq_f32( z, vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
-
- default:
- break;
- }
-
-
- // select best answer between hi and lo results
- uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
- float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
- uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
-
- // select best answer between even and odd results
- float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
- uint32x2_t indexHi = vdup_lane_u32(index2, 1);
- mask = vcgt_f32( maxDotO, maxDot2 );
- maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
- index2 = vbsl_u32(mask, indexHi, index2);
-
- *dotResult = vget_lane_f32( maxDot2, 0);
- return vget_lane_u32(index2, 0);
-
+ float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
+ float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
+ float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
+ const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
+ uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
+ uint32x4_t index = (uint32x4_t){-1, -1, -1, -1};
+ float32x4_t maxDot = (float32x4_t){-B3_INFINITY, -B3_INFINITY, -B3_INFINITY, -B3_INFINITY};
+
+ unsigned long i = 0;
+ for (; i + 8 <= count; i += 8)
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+ float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+ xy0 = vmulq_f32(xy0, vLo);
+ xy1 = vmulq_f32(xy1, vLo);
+
+ float32x4x2_t zb = vuzpq_f32(z0, z1);
+ float32x4_t z = vmulq_f32(zb.val[0], vHi);
+ float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+ float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ uint32x4_t mask = vcgtq_f32(x, maxDot);
+ maxDot = vbslq_f32(mask, x, maxDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+
+ v0 = vld1q_f32_aligned_postincrement(vv);
+ v1 = vld1q_f32_aligned_postincrement(vv);
+ v2 = vld1q_f32_aligned_postincrement(vv);
+ v3 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+ xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+ // the next two lines should resolve to a single vswp d, d
+ z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+ z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+ xy0 = vmulq_f32(xy0, vLo);
+ xy1 = vmulq_f32(xy1, vLo);
+
+ zb = vuzpq_f32(z0, z1);
+ z = vmulq_f32(zb.val[0], vHi);
+ xy = vuzpq_f32(xy0, xy1);
+ x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ mask = vcgtq_f32(x, maxDot);
+ maxDot = vbslq_f32(mask, x, maxDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+ }
+
+ for (; i + 4 <= count; i += 4)
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+ float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+ xy0 = vmulq_f32(xy0, vLo);
+ xy1 = vmulq_f32(xy1, vLo);
+
+ float32x4x2_t zb = vuzpq_f32(z0, z1);
+ float32x4_t z = vmulq_f32(zb.val[0], vHi);
+ float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+ float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ uint32x4_t mask = vcgtq_f32(x, maxDot);
+ maxDot = vbslq_f32(mask, x, maxDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+ }
+
+ switch (count & 3)
+ {
+ case 3:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+ float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
+
+ xy0 = vmulq_f32(xy0, vLo);
+ xy1 = vmulq_f32(xy1, vLo);
+
+ float32x4x2_t zb = vuzpq_f32(z0, z1);
+ float32x4_t z = vmulq_f32(zb.val[0], vHi);
+ float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+ float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ uint32x4_t mask = vcgtq_f32(x, maxDot);
+ maxDot = vbslq_f32(mask, x, maxDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+ }
+ break;
+
+ case 2:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+
+ xy0 = vmulq_f32(xy0, vLo);
+
+ float32x4x2_t zb = vuzpq_f32(z0, z0);
+ float32x4_t z = vmulq_f32(zb.val[0], vHi);
+ float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+ float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ uint32x4_t mask = vcgtq_f32(x, maxDot);
+ maxDot = vbslq_f32(mask, x, maxDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+ }
+ break;
+
+ case 1:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
+
+ xy0 = vmulq_f32(xy0, vLo);
+
+ z = vmulq_f32(z, vHi);
+ float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+ float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ uint32x4_t mask = vcgtq_f32(x, maxDot);
+ maxDot = vbslq_f32(mask, x, maxDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ // select best answer between hi and lo results
+ uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
+ float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
+ uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+
+ // select best answer between even and odd results
+ float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
+ uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+ mask = vcgt_f32(maxDotO, maxDot2);
+ maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
+ index2 = vbsl_u32(mask, indexHi, index2);
+
+ *dotResult = vget_lane_f32(maxDot2, 0);
+ return vget_lane_u32(index2, 0);
}
-long b3_mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
- unsigned long i = 0;
- float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
- float32x2_t vLo = vget_low_f32(vvec);
- float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
- float32x2_t dotMinLo = (float32x2_t) { B3_INFINITY, B3_INFINITY };
- float32x2_t dotMinHi = (float32x2_t) { B3_INFINITY, B3_INFINITY };
- uint32x2_t indexLo = (uint32x2_t) {0, 1};
- uint32x2_t indexHi = (uint32x2_t) {2, 3};
- uint32x2_t iLo = (uint32x2_t) {-1, -1};
- uint32x2_t iHi = (uint32x2_t) {-1, -1};
- const uint32x2_t four = (uint32x2_t) {4,4};
-
- for( ; i+8 <= count; i+= 8 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
- uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
-
- v0 = vld1q_f32_aligned_postincrement( vv );
- v1 = vld1q_f32_aligned_postincrement( vv );
- v2 = vld1q_f32_aligned_postincrement( vv );
- v3 = vld1q_f32_aligned_postincrement( vv );
-
- xy0 = vmul_f32( vget_low_f32(v0), vLo);
- xy1 = vmul_f32( vget_low_f32(v1), vLo);
- xy2 = vmul_f32( vget_low_f32(v2), vLo);
- xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- zLo = vmul_f32( z0.val[0], vHi);
- zHi = vmul_f32( z1.val[0], vHi);
-
- rLo = vpadd_f32( xy0, xy1);
- rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- maskLo = vclt_f32( rLo, dotMinLo );
- maskHi = vclt_f32( rHi, dotMinHi );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
-
- for( ; i+4 <= count; i+= 4 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
- uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
- switch( count & 3 )
- {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy2);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
- uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- }
- break;
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- rLo = vadd_f32(rLo, zLo);
-
- uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
- float32x2_t zLo = vmul_f32( z0, vHi);
- float32x2_t rLo = vpadd_f32( xy0, xy0);
- rLo = vadd_f32(rLo, zLo);
- uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
-
- default:
- break;
- }
-
- // select best answer between hi and lo results
- uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
- dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
- iLo = vbsl_u32(mask, iHi, iLo);
-
- // select best answer between even and odd results
- dotMinHi = vdup_lane_f32(dotMinLo, 1);
- iHi = vdup_lane_u32(iLo, 1);
- mask = vclt_f32( dotMinHi, dotMinLo );
- dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
- iLo = vbsl_u32(mask, iHi, iLo);
-
- *dotResult = vget_lane_f32( dotMinLo, 0);
- return vget_lane_u32(iLo, 0);
+ unsigned long i = 0;
+ float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
+ float32x2_t vLo = vget_low_f32(vvec);
+ float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
+ float32x2_t dotMinLo = (float32x2_t){B3_INFINITY, B3_INFINITY};
+ float32x2_t dotMinHi = (float32x2_t){B3_INFINITY, B3_INFINITY};
+ uint32x2_t indexLo = (uint32x2_t){0, 1};
+ uint32x2_t indexHi = (uint32x2_t){2, 3};
+ uint32x2_t iLo = (uint32x2_t){-1, -1};
+ uint32x2_t iHi = (uint32x2_t){-1, -1};
+ const uint32x2_t four = (uint32x2_t){4, 4};
+
+ for (; i + 8 <= count; i += 8)
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+ float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+ float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+ float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+ float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+ float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+ float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+
+ float32x2_t rLo = vpadd_f32(xy0, xy1);
+ float32x2_t rHi = vpadd_f32(xy2, xy3);
+ rLo = vadd_f32(rLo, zLo);
+ rHi = vadd_f32(rHi, zHi);
+
+ uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+ uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
+ dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+ dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ iHi = vbsl_u32(maskHi, indexHi, iHi);
+ indexLo = vadd_u32(indexLo, four);
+ indexHi = vadd_u32(indexHi, four);
+
+ v0 = vld1q_f32_aligned_postincrement(vv);
+ v1 = vld1q_f32_aligned_postincrement(vv);
+ v2 = vld1q_f32_aligned_postincrement(vv);
+ v3 = vld1q_f32_aligned_postincrement(vv);
+
+ xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ xy1 = vmul_f32(vget_low_f32(v1), vLo);
+ xy2 = vmul_f32(vget_low_f32(v2), vLo);
+ xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+ z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+ z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+ zLo = vmul_f32(z0.val[0], vHi);
+ zHi = vmul_f32(z1.val[0], vHi);
+
+ rLo = vpadd_f32(xy0, xy1);
+ rHi = vpadd_f32(xy2, xy3);
+ rLo = vadd_f32(rLo, zLo);
+ rHi = vadd_f32(rHi, zHi);
+
+ maskLo = vclt_f32(rLo, dotMinLo);
+ maskHi = vclt_f32(rHi, dotMinHi);
+ dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+ dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ iHi = vbsl_u32(maskHi, indexHi, iHi);
+ indexLo = vadd_u32(indexLo, four);
+ indexHi = vadd_u32(indexHi, four);
+ }
+
+ for (; i + 4 <= count; i += 4)
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+ float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+ float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+ float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
+
+ float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
+ float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+ float32x2_t zHi = vmul_f32(z1.val[0], vHi);
+
+ float32x2_t rLo = vpadd_f32(xy0, xy1);
+ float32x2_t rHi = vpadd_f32(xy2, xy3);
+ rLo = vadd_f32(rLo, zLo);
+ rHi = vadd_f32(rHi, zHi);
+
+ uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+ uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
+ dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+ dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ iHi = vbsl_u32(maskHi, indexHi, iHi);
+ indexLo = vadd_u32(indexLo, four);
+ indexHi = vadd_u32(indexHi, four);
+ }
+ switch (count & 3)
+ {
+ case 3:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+
+ float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+ float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
+
+ float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+ float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
+
+ float32x2_t rLo = vpadd_f32(xy0, xy1);
+ float32x2_t rHi = vpadd_f32(xy2, xy2);
+ rLo = vadd_f32(rLo, zLo);
+ rHi = vadd_f32(rHi, zHi);
+
+ uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+ uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
+ dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+ dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ iHi = vbsl_u32(maskHi, indexHi, iHi);
+ }
+ break;
+ case 2:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+
+ float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
+
+ float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x2_t zLo = vmul_f32(z0.val[0], vHi);
+
+ float32x2_t rLo = vpadd_f32(xy0, xy1);
+ rLo = vadd_f32(rLo, zLo);
+
+ uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+ dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ }
+ break;
+ case 1:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
+ float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
+ float32x2_t zLo = vmul_f32(z0, vHi);
+ float32x2_t rLo = vpadd_f32(xy0, xy0);
+ rLo = vadd_f32(rLo, zLo);
+ uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
+ dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
+ iLo = vbsl_u32(maskLo, indexLo, iLo);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ // select best answer between hi and lo results
+ uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
+ dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+ iLo = vbsl_u32(mask, iHi, iLo);
+
+ // select best answer between even and odd results
+ dotMinHi = vdup_lane_f32(dotMinLo, 1);
+ iHi = vdup_lane_u32(iLo, 1);
+ mask = vclt_f32(dotMinHi, dotMinLo);
+ dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
+ iLo = vbsl_u32(mask, iHi, iLo);
+
+ *dotResult = vget_lane_f32(dotMinLo, 0);
+ return vget_lane_u32(iLo, 0);
}
-long b3_mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
+long b3_mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
{
- float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
- float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
- float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
- const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
- uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
- uint32x4_t index = (uint32x4_t) { -1, -1, -1, -1 };
- float32x4_t minDot = (float32x4_t) { B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY };
-
- unsigned long i = 0;
- for( ; i + 8 <= count; i += 8 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
-
- v0 = vld1q_f32_aligned_postincrement( vv );
- v1 = vld1q_f32_aligned_postincrement( vv );
- v2 = vld1q_f32_aligned_postincrement( vv );
- v3 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- zb = vuzpq_f32( z0, z1);
- z = vmulq_f32( zb.val[0], vHi);
- xy = vuzpq_f32( xy0, xy1);
- x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
-
- for( ; i + 4 <= count; i += 4 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
-
- switch (count & 3) {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
-
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-
- xy0 = vmulq_f32(xy0, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z0);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
-
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
-
- xy0 = vmulq_f32(xy0, vLo);
-
- z = vmulq_f32( z, vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
-
- default:
- break;
- }
-
-
- // select best answer between hi and lo results
- uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
- float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
- uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
-
- // select best answer between even and odd results
- float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
- uint32x2_t indexHi = vdup_lane_u32(index2, 1);
- mask = vclt_f32( minDotO, minDot2 );
- minDot2 = vbsl_f32(mask, minDotO, minDot2);
- index2 = vbsl_u32(mask, indexHi, index2);
-
- *dotResult = vget_lane_f32( minDot2, 0);
- return vget_lane_u32(index2, 0);
-
+ float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
+ float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
+ float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
+ const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
+ uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
+ uint32x4_t index = (uint32x4_t){-1, -1, -1, -1};
+ float32x4_t minDot = (float32x4_t){B3_INFINITY, B3_INFINITY, B3_INFINITY, B3_INFINITY};
+
+ unsigned long i = 0;
+ for (; i + 8 <= count; i += 8)
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+ float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+ xy0 = vmulq_f32(xy0, vLo);
+ xy1 = vmulq_f32(xy1, vLo);
+
+ float32x4x2_t zb = vuzpq_f32(z0, z1);
+ float32x4_t z = vmulq_f32(zb.val[0], vHi);
+ float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+ float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ uint32x4_t mask = vcltq_f32(x, minDot);
+ minDot = vbslq_f32(mask, x, minDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+
+ v0 = vld1q_f32_aligned_postincrement(vv);
+ v1 = vld1q_f32_aligned_postincrement(vv);
+ v2 = vld1q_f32_aligned_postincrement(vv);
+ v3 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+ xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+ // the next two lines should resolve to a single vswp d, d
+ z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+ z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+ xy0 = vmulq_f32(xy0, vLo);
+ xy1 = vmulq_f32(xy1, vLo);
+
+ zb = vuzpq_f32(z0, z1);
+ z = vmulq_f32(zb.val[0], vHi);
+ xy = vuzpq_f32(xy0, xy1);
+ x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ mask = vcltq_f32(x, minDot);
+ minDot = vbslq_f32(mask, x, minDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+ }
+
+ for (; i + 4 <= count; i += 4)
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+ float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
+
+ xy0 = vmulq_f32(xy0, vLo);
+ xy1 = vmulq_f32(xy1, vLo);
+
+ float32x4x2_t zb = vuzpq_f32(z0, z1);
+ float32x4_t z = vmulq_f32(zb.val[0], vHi);
+ float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+ float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ uint32x4_t mask = vcltq_f32(x, minDot);
+ minDot = vbslq_f32(mask, x, minDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+ }
+
+ switch (count & 3)
+ {
+ case 3:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+ float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+ float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
+
+ xy0 = vmulq_f32(xy0, vLo);
+ xy1 = vmulq_f32(xy1, vLo);
+
+ float32x4x2_t zb = vuzpq_f32(z0, z1);
+ float32x4_t z = vmulq_f32(zb.val[0], vHi);
+ float32x4x2_t xy = vuzpq_f32(xy0, xy1);
+ float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ uint32x4_t mask = vcltq_f32(x, minDot);
+ minDot = vbslq_f32(mask, x, minDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+ }
+ break;
+
+ case 2:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+ float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
+
+ xy0 = vmulq_f32(xy0, vLo);
+
+ float32x4x2_t zb = vuzpq_f32(z0, z0);
+ float32x4_t z = vmulq_f32(zb.val[0], vHi);
+ float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+ float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ uint32x4_t mask = vcltq_f32(x, minDot);
+ minDot = vbslq_f32(mask, x, minDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+ }
+ break;
+
+ case 1:
+ {
+ float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
+
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
+ // the next two lines should resolve to a single vswp d, d
+ float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
+
+ xy0 = vmulq_f32(xy0, vLo);
+
+ z = vmulq_f32(z, vHi);
+ float32x4x2_t xy = vuzpq_f32(xy0, xy0);
+ float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
+ x = vaddq_f32(x, z);
+
+ uint32x4_t mask = vcltq_f32(x, minDot);
+ minDot = vbslq_f32(mask, x, minDot);
+ index = vbslq_u32(mask, local_index, index);
+ local_index = vaddq_u32(local_index, four);
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ // select best answer between hi and lo results
+ uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
+ float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
+ uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
+
+ // select best answer between even and odd results
+ float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
+ uint32x2_t indexHi = vdup_lane_u32(index2, 1);
+ mask = vclt_f32(minDotO, minDot2);
+ minDot2 = vbsl_f32(mask, minDotO, minDot2);
+ index2 = vbsl_u32(mask, indexHi, index2);
+
+ *dotResult = vget_lane_f32(minDot2, 0);
+ return vget_lane_u32(index2, 0);
}
#else
- #error Unhandled __APPLE__ arch
+#error Unhandled __APPLE__ arch
#endif
-#endif /* __APPLE__ */
-
-
+#endif /* __APPLE__ */