1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
|
// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
/* Redirect the generic SIMD wrapper names to their *_impl spellings while this
 * header is processed, so that e.g. `struct vdouble<8>` below names
 * `vdouble_impl<8>`. The matching #undef lines at the end of the file remove
 * the redirections again.
 * NOTE(review): presumably the primary templates are declared under the
 * *_impl names in another header -- confirm. */
#define vboolf vboolf_impl
#define vboold vboold_impl
#define vint vint_impl
#define vuint vuint_impl
#define vllong vllong_impl
#define vfloat vfloat_impl
#define vdouble vdouble_impl
namespace embree
{
/* 8-wide AVX-512 vector of 64-bit doubles */
template<>
struct vdouble<8>
{
  ALIGNED_STRUCT_(64);

  typedef vboold8 Bool;   // mask type used together with this vector type

  enum { size = 8 };      // number of SIMD elements

  union {                 // data: one 512-bit register, also addressable per lane
    __m512d v;
    double i[8];
  };

  ////////////////////////////////////////////////////////////////////////////////
  /// Constructors, Assignment & Cast Operators
  ////////////////////////////////////////////////////////////////////////////////

  /* Default construction leaves every lane uninitialized. */
  __forceinline vdouble() {}

  /* Copy construction / assignment transfer the raw register contents. */
  __forceinline vdouble(const vdouble8& t) : v(t.v) {}

  __forceinline vdouble8& operator =(const vdouble8& f) {
    v = f.v;
    return *this;
  }

  /* Implicit conversions from and to the native 512-bit register type. */
  __forceinline vdouble(const __m512d& t) : v(t) {}

  __forceinline operator __m512d() const { return v; }

  /* Implicit conversion to a 256-bit register holding the lower four lanes. */
  __forceinline operator __m256d() const { return _mm512_castpd512_pd256(v); }

  /* Broadcasts the scalar i into all eight lanes
   * (the parameter hides the member array i inside this constructor). */
  __forceinline vdouble(double i) : v(_mm512_set1_pd(i)) {}

  /* Repeats the four values in both 256-bit halves: lanes are a,b,c,d,a,b,c,d. */
  __forceinline vdouble(double a, double b, double c, double d)
    : v(_mm512_set4_pd(d,c,b,a)) {}

  /* Sets each lane individually; a0 ends up in lane 0 and a7 in lane 7. */
  __forceinline vdouble(double a0, double a1, double a2, double a3,
                        double a4, double a5, double a6, double a7)
    : v(_mm512_set_pd(a7,a6,a5,a4,a3,a2,a1,a0)) {}

  ////////////////////////////////////////////////////////////////////////////////
  /// Constants
  ////////////////////////////////////////////////////////////////////////////////

  /* All lanes 0.0. */
  __forceinline vdouble(ZeroTy) : v(_mm512_setzero_pd()) {}

  /* All lanes 1.0. */
  __forceinline vdouble(OneTy) : v(_mm512_set1_pd(1.0)) {}

  /* Lane k holds the value k (0,1,...,7). */
  __forceinline vdouble(StepTy) : v(_mm512_setr_pd(0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0)) {}

  /* Lane k holds the value 7-k (7,6,...,0). */
  __forceinline vdouble(ReverseStepTy) : v(_mm512_setr_pd(7.0,6.0,5.0,4.0,3.0,2.0,1.0,0.0)) {}

  ////////////////////////////////////////////////////////////////////////////////
  /// Loads and Stores
  ////////////////////////////////////////////////////////////////////////////////

  /* Writes all lanes of a to ptr through the streaming (non-temporal) store intrinsic. */
  static __forceinline void store_nt(void *__restrict__ ptr, const vdouble8& a) {
    _mm512_stream_pd(static_cast<double*>(ptr), a);
  }

  /* Loads eight doubles from addr using the unaligned-load intrinsic. */
  static __forceinline vdouble8 loadu(const void* addr) {
    return _mm512_loadu_pd(static_cast<const double*>(addr));
  }

  /* Loads eight doubles through the aligned-load intrinsic. */
  static __forceinline vdouble8 load(const vdouble8* addr) {
    return _mm512_load_pd(reinterpret_cast<const double*>(addr));
  }

  static __forceinline vdouble8 load(const double* addr) {
    return _mm512_load_pd(addr);
  }

  /* Stores all lanes of v to ptr through the aligned-store intrinsic. */
  static __forceinline void store(void* ptr, const vdouble8& v) {
    _mm512_store_pd(ptr, v);
  }

  /* Stores all lanes of v to ptr through the unaligned-store intrinsic. */
  static __forceinline void storeu(void* ptr, const vdouble8& v) {
    _mm512_storeu_pd(ptr, v);
  }

  /* Masked unaligned store: only lanes whose mask bit is set are written. */
  static __forceinline void storeu(const vboold8& mask, double* ptr, const vdouble8& f) {
    _mm512_mask_storeu_pd(ptr, mask, f);
  }

  /* Masked aligned store: only lanes whose mask bit is set are written. */
  static __forceinline void store(const vboold8& mask, void* addr, const vdouble8& v2) {
    _mm512_mask_store_pd(addr, mask, v2);
  }

  /* Packs the mask-selected lanes of v contiguously into the low lanes of the
   * result; the remaining lanes keep the values of v. */
  static __forceinline vdouble8 compact(const vboold8& mask, vdouble8& v) {
    return _mm512_mask_compress_pd(v, mask, v);
  }

  /* Packs the mask-selected lanes of b contiguously into the low lanes of the
   * result; the remaining lanes are taken from a. */
  static __forceinline vdouble8 compact(const vboold8& mask, const vdouble8& a, vdouble8& b) {
    return _mm512_mask_compress_pd(a, mask, b);
  }

  /* Reads one double from a and replicates it into all lanes. */
  static __forceinline vdouble8 broadcast(const void* a) {
    return _mm512_set1_pd(*static_cast<const double*>(a));
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Array Access
  ////////////////////////////////////////////////////////////////////////////////

  /* Per-lane access through the scalar view of the union; index must be below 8. */
  __forceinline double& operator [](size_t index) {
    assert(index < 8);
    return i[index];
  }

  __forceinline const double& operator [](size_t index) const {
    assert(index < 8);
    return i[index];
  }
};
////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////
/* Reinterprets the 512 bits of a 64-bit integer vector as eight doubles
 * (a bit cast; no numeric conversion takes place). */
__forceinline vdouble8 asDouble(const vllong8& a) {
  const __m512d bits = _mm512_castsi512_pd(a);
  return bits;
}

/* Reinterprets the 512 bits of a double vector as eight 64-bit integers
 * (a bit cast; no numeric conversion takes place). */
__forceinline vllong8 asLLong (const vdouble8& a) {
  const __m512i bits = _mm512_castpd_si512(a);
  return bits;
}
/* Unary plus: returns the operand unchanged. */
__forceinline vdouble8 operator +(const vdouble8& a) { return a; }

/* Unary minus: flips the sign bit of every lane.
 * Negation is done with an XOR against a sign-bit mask rather than as
 * 0 - a, because 0.0 - (+0.0) evaluates to +0.0 under round-to-nearest and
 * would therefore not negate a positive zero. */
__forceinline vdouble8 operator -(const vdouble8& a) {
  const __m512d sign_mask = _mm512_set1_pd(-0.0);  // only bit 63 of each lane set
  return _mm512_xor_pd(a, sign_mask);
}
////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////
/* Lane-wise sum of two vectors. */
__forceinline vdouble8 operator +(const vdouble8& a, const vdouble8& b) {
  return vdouble8(_mm512_add_pd(a, b));
}

/* Adds the scalar b (broadcast to all lanes) to every lane of a. */
__forceinline vdouble8 operator +(const vdouble8& a, double b) {
  return vdouble8(_mm512_add_pd(a, _mm512_set1_pd(b)));
}

/* Adds every lane of b to the scalar a (broadcast to all lanes). */
__forceinline vdouble8 operator +(double a, const vdouble8& b) {
  return vdouble8(_mm512_add_pd(_mm512_set1_pd(a), b));
}
/* Lane-wise difference a - b. */
__forceinline vdouble8 operator -(const vdouble8& a, const vdouble8& b) {
  return vdouble8(_mm512_sub_pd(a, b));
}

/* Subtracts the scalar b (broadcast to all lanes) from every lane of a. */
__forceinline vdouble8 operator -(const vdouble8& a, double b) {
  return vdouble8(_mm512_sub_pd(a, _mm512_set1_pd(b)));
}

/* Subtracts every lane of b from the scalar a (broadcast to all lanes). */
__forceinline vdouble8 operator -(double a, const vdouble8& b) {
  return vdouble8(_mm512_sub_pd(_mm512_set1_pd(a), b));
}
/* Lane-wise product of two vectors. */
__forceinline vdouble8 operator *(const vdouble8& a, const vdouble8& b) {
  return vdouble8(_mm512_mul_pd(a, b));
}

/* Multiplies every lane of a by the scalar b (broadcast to all lanes). */
__forceinline vdouble8 operator *(const vdouble8& a, double b) {
  return vdouble8(_mm512_mul_pd(a, _mm512_set1_pd(b)));
}

/* Multiplies the scalar a (broadcast to all lanes) by every lane of b. */
__forceinline vdouble8 operator *(double a, const vdouble8& b) {
  return vdouble8(_mm512_mul_pd(_mm512_set1_pd(a), b));
}
/* Bitwise AND of the raw 64-bit lane patterns of a and b. */
__forceinline vdouble8 operator &(const vdouble8& a, const vdouble8& b) {
  return vdouble8(_mm512_and_pd(a, b));
}

/* Bitwise AND of every lane of a with the bit pattern of the scalar b. */
__forceinline vdouble8 operator &(const vdouble8& a, double b) {
  return vdouble8(_mm512_and_pd(a, _mm512_set1_pd(b)));
}

/* Bitwise AND of the bit pattern of the scalar a with every lane of b. */
__forceinline vdouble8 operator &(double a, const vdouble8& b) {
  return vdouble8(_mm512_and_pd(_mm512_set1_pd(a), b));
}
/* Bitwise OR of the raw 64-bit lane patterns of a and b. */
__forceinline vdouble8 operator |(const vdouble8& a, const vdouble8& b) {
  return vdouble8(_mm512_or_pd(a, b));
}

/* Bitwise OR of every lane of a with the bit pattern of the scalar b. */
__forceinline vdouble8 operator |(const vdouble8& a, double b) {
  return vdouble8(_mm512_or_pd(a, _mm512_set1_pd(b)));
}

/* Bitwise OR of the bit pattern of the scalar a with every lane of b. */
__forceinline vdouble8 operator |(double a, const vdouble8& b) {
  return vdouble8(_mm512_or_pd(_mm512_set1_pd(a), b));
}
/* Bitwise XOR of the raw 64-bit lane patterns of a and b. */
__forceinline vdouble8 operator ^(const vdouble8& a, const vdouble8& b) {
  return vdouble8(_mm512_xor_pd(a, b));
}

/* Bitwise XOR of every lane of a with the bit pattern of the scalar b. */
__forceinline vdouble8 operator ^(const vdouble8& a, double b) {
  return vdouble8(_mm512_xor_pd(a, _mm512_set1_pd(b)));
}

/* Bitwise XOR of the bit pattern of the scalar a with every lane of b. */
__forceinline vdouble8 operator ^(double a, const vdouble8& b) {
  return vdouble8(_mm512_xor_pd(_mm512_set1_pd(a), b));
}
/* All shifts below act on the raw 64-bit pattern of each lane (the vector is
 * bit-cast to integers, shifted, and bit-cast back); they do not scale the
 * numeric value of the doubles. */

/* Left shift of every lane by the same count n. */
__forceinline vdouble8 operator <<(const vdouble8& a, const unsigned int n) {
  const __m512i bits = _mm512_castpd_si512(a);
  return _mm512_castsi512_pd(_mm512_slli_epi64(bits, n));
}

/* Arithmetic (sign-replicating) right shift of every lane by the same count n. */
__forceinline vdouble8 operator >>(const vdouble8& a, const unsigned int n) {
  const __m512i bits = _mm512_castpd_si512(a);
  return _mm512_castsi512_pd(_mm512_srai_epi64(bits, n));
}

/* Left shift with an individual count per lane taken from n. */
__forceinline vdouble8 operator <<(const vdouble8& a, const vllong8& n) {
  const __m512i bits = _mm512_castpd_si512(a);
  return _mm512_castsi512_pd(_mm512_sllv_epi64(bits, n));
}

/* Arithmetic right shift with an individual count per lane taken from n. */
__forceinline vdouble8 operator >>(const vdouble8& a, const vllong8& n) {
  const __m512i bits = _mm512_castpd_si512(a);
  return _mm512_castsi512_pd(_mm512_srav_epi64(bits, n));
}

/* Shift left logical by b bits. */
__forceinline vdouble8 sll (const vdouble8& a, const unsigned int b) {
  const __m512i bits = _mm512_castpd_si512(a);
  return _mm512_castsi512_pd(_mm512_slli_epi64(bits, b));
}

/* Shift right arithmetic by b bits. */
__forceinline vdouble8 sra (const vdouble8& a, const unsigned int b) {
  const __m512i bits = _mm512_castpd_si512(a);
  return _mm512_castsi512_pd(_mm512_srai_epi64(bits, b));
}

/* Shift right logical (zero-filling) by b bits. */
__forceinline vdouble8 srl (const vdouble8& a, const unsigned int b) {
  const __m512i bits = _mm512_castpd_si512(a);
  return _mm512_castsi512_pd(_mm512_srli_epi64(bits, b));
}
/* Lane-wise minimum of two vectors. */
__forceinline vdouble8 min(const vdouble8& a, const vdouble8& b) {
  return vdouble8(_mm512_min_pd(a, b));
}

/* Lane-wise minimum of a and the scalar b (broadcast to all lanes). */
__forceinline vdouble8 min(const vdouble8& a, double b) {
  return vdouble8(_mm512_min_pd(a, _mm512_set1_pd(b)));
}

/* Lane-wise minimum of the scalar a (broadcast to all lanes) and b. */
__forceinline vdouble8 min(double a, const vdouble8& b) {
  return vdouble8(_mm512_min_pd(_mm512_set1_pd(a), b));
}
/* Lane-wise maximum of two vectors. */
__forceinline vdouble8 max(const vdouble8& a, const vdouble8& b) {
  return vdouble8(_mm512_max_pd(a, b));
}

/* Lane-wise maximum of a and the scalar b (broadcast to all lanes). */
__forceinline vdouble8 max(const vdouble8& a, double b) {
  return vdouble8(_mm512_max_pd(a, _mm512_set1_pd(b)));
}

/* Lane-wise maximum of the scalar a (broadcast to all lanes) and b. */
__forceinline vdouble8 max(double a, const vdouble8& b) {
  return vdouble8(_mm512_max_pd(_mm512_set1_pd(a), b));
}
/* Masked binary operations: lanes whose mask bit is set receive the result of
 * the operation on a and b, all other lanes are copied from c. The argument c
 * is only read; the combined vector is returned. */

/* Masked a + b. */
__forceinline vdouble8 mask_add(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) {
  const __m512d merged = _mm512_mask_add_pd(c, mask, a, b);
  return merged;
}

/* Masked a - b. */
__forceinline vdouble8 mask_sub(const vboold8& mask, vdouble8& c, const vdouble8& a, const vdouble8& b) {
  const __m512d merged = _mm512_mask_sub_pd(c, mask, a, b);
  return merged;
}

/* Masked bitwise a & b. */
__forceinline vdouble8 mask_and(const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) {
  const __m512d merged = _mm512_mask_and_pd(c, m, a, b);
  return merged;
}

/* Masked bitwise a | b. */
__forceinline vdouble8 mask_or (const vboold8& m,vdouble8& c, const vdouble8& a, const vdouble8& b) {
  const __m512d merged = _mm512_mask_or_pd(c, m, a, b);
  return merged;
}
////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////
/* Fused multiply-add family, each mapped to one AVX-512 FMA intrinsic. */

/* a*b + c */
__forceinline vdouble8 madd (const vdouble8& a, const vdouble8& b, const vdouble8& c) {
  const __m512d r = _mm512_fmadd_pd(a, b, c);
  return r;
}

/* a*b - c */
__forceinline vdouble8 msub (const vdouble8& a, const vdouble8& b, const vdouble8& c) {
  const __m512d r = _mm512_fmsub_pd(a, b, c);
  return r;
}

/* -(a*b) + c */
__forceinline vdouble8 nmadd(const vdouble8& a, const vdouble8& b, const vdouble8& c) {
  const __m512d r = _mm512_fnmadd_pd(a, b, c);
  return r;
}

/* -(a*b) - c */
__forceinline vdouble8 nmsub(const vdouble8& a, const vdouble8& b, const vdouble8& c) {
  const __m512d r = _mm512_fnmsub_pd(a, b, c);
  return r;
}
////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////
/* Compound arithmetic assignment: each operator evaluates the matching binary
 * operator, stores the result back into a and returns a reference to a. */

__forceinline vdouble8& operator +=(vdouble8& a, const vdouble8& b) {
  a = a + b;
  return a;
}

__forceinline vdouble8& operator +=(vdouble8& a, double b) {
  a = a + b;
  return a;
}

__forceinline vdouble8& operator -=(vdouble8& a, const vdouble8& b) {
  a = a - b;
  return a;
}

__forceinline vdouble8& operator -=(vdouble8& a, double b) {
  a = a - b;
  return a;
}

__forceinline vdouble8& operator *=(vdouble8& a, const vdouble8& b) {
  a = a * b;
  return a;
}

__forceinline vdouble8& operator *=(vdouble8& a, double b) {
  a = a * b;
  return a;
}
/* Compound bitwise assignment: applies the matching binary operator to the
 * raw lane bit patterns, stores the result in a and returns a reference to a. */

__forceinline vdouble8& operator &=(vdouble8& a, const vdouble8& b) {
  a = a & b;
  return a;
}

__forceinline vdouble8& operator &=(vdouble8& a, double b) {
  a = a & b;
  return a;
}

__forceinline vdouble8& operator |=(vdouble8& a, const vdouble8& b) {
  a = a | b;
  return a;
}

__forceinline vdouble8& operator |=(vdouble8& a, double b) {
  a = a | b;
  return a;
}
/* Compound shift assignment on the raw 64-bit lane patterns.
 * The shift count is an unsigned integer, matching the operator << / >>
 * overloads these forward to. It was previously declared as double and
 * silently truncated on the way to those overloads; arguments of any
 * arithmetic type still convert implicitly, so existing calls keep compiling. */

/* Shifts every lane of a left by b bits, in place. */
__forceinline vdouble8& operator <<=(vdouble8& a, const unsigned int b) {
  a = a << b;
  return a;
}

/* Shifts every lane of a right (arithmetic) by b bits, in place. */
__forceinline vdouble8& operator >>=(vdouble8& a, const unsigned int b) {
  a = a >> b;
  return a;
}
////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////
/* Lane-wise equality; bit k of the returned mask is set where a[k] == b[k]. */
__forceinline vboold8 operator ==(const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(a, b, _MM_CMPINT_EQ));
}

/* Equality against a scalar broadcast to all lanes. */
__forceinline vboold8 operator ==(const vdouble8& a, double b) {
  return vboold8(_mm512_cmp_pd_mask(a, _mm512_set1_pd(b), _MM_CMPINT_EQ));
}

__forceinline vboold8 operator ==(double a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(_mm512_set1_pd(a), b, _MM_CMPINT_EQ));
}

/* Lane-wise inequality; bit k of the returned mask is set where a[k] != b[k]. */
__forceinline vboold8 operator !=(const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(a, b, _MM_CMPINT_NE));
}

/* Inequality against a scalar broadcast to all lanes. */
__forceinline vboold8 operator !=(const vdouble8& a, double b) {
  return vboold8(_mm512_cmp_pd_mask(a, _mm512_set1_pd(b), _MM_CMPINT_NE));
}

__forceinline vboold8 operator !=(double a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(_mm512_set1_pd(a), b, _MM_CMPINT_NE));
}
/* Lane-wise relational operators. Each returns a mask whose bit k reflects the
 * comparison of a[k] with b[k]; scalar operands are broadcast to all lanes.
 * NOTE(review): the predicates come from the integer-compare enum. In common
 * intrinsics headers _MM_CMPINT_GE and _MM_CMPINT_GT alias the "not less than"
 * and "not less-or-equal" encodings, which as floating-point predicates would
 * report true for lanes containing NaN -- confirm this is intended. */

/* a < b */
__forceinline vboold8 operator < (const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(a, b, _MM_CMPINT_LT));
}

__forceinline vboold8 operator < (const vdouble8& a, double b) {
  return vboold8(_mm512_cmp_pd_mask(a, _mm512_set1_pd(b), _MM_CMPINT_LT));
}

__forceinline vboold8 operator < (double a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(_mm512_set1_pd(a), b, _MM_CMPINT_LT));
}

/* a >= b */
__forceinline vboold8 operator >=(const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(a, b, _MM_CMPINT_GE));
}

__forceinline vboold8 operator >=(const vdouble8& a, double b) {
  return vboold8(_mm512_cmp_pd_mask(a, _mm512_set1_pd(b), _MM_CMPINT_GE));
}

__forceinline vboold8 operator >=(double a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(_mm512_set1_pd(a), b, _MM_CMPINT_GE));
}

/* a > b */
__forceinline vboold8 operator > (const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(a, b, _MM_CMPINT_GT));
}

__forceinline vboold8 operator > (const vdouble8& a, double b) {
  return vboold8(_mm512_cmp_pd_mask(a, _mm512_set1_pd(b), _MM_CMPINT_GT));
}

__forceinline vboold8 operator > (double a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(_mm512_set1_pd(a), b, _MM_CMPINT_GT));
}

/* a <= b */
__forceinline vboold8 operator <=(const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(a, b, _MM_CMPINT_LE));
}

__forceinline vboold8 operator <=(const vdouble8& a, double b) {
  return vboold8(_mm512_cmp_pd_mask(a, _mm512_set1_pd(b), _MM_CMPINT_LE));
}

__forceinline vboold8 operator <=(double a, const vdouble8& b) {
  return vboold8(_mm512_cmp_pd_mask(_mm512_set1_pd(a), b, _MM_CMPINT_LE));
}
/* Named forms of the lane-wise comparison operators; each one yields the same
 * mask as the corresponding operator applied to two vectors. */
__forceinline vboold8 eq(const vdouble8& a, const vdouble8& b) { return a == b; }
__forceinline vboold8 ne(const vdouble8& a, const vdouble8& b) { return a != b; }
__forceinline vboold8 lt(const vdouble8& a, const vdouble8& b) { return a <  b; }
__forceinline vboold8 ge(const vdouble8& a, const vdouble8& b) { return a >= b; }
__forceinline vboold8 gt(const vdouble8& a, const vdouble8& b) { return a >  b; }
__forceinline vboold8 le(const vdouble8& a, const vdouble8& b) { return a <= b; }
/* Masked comparisons: a result bit is set only where the corresponding bit of
 * mask is set and the predicate holds; bits outside mask are cleared. */

__forceinline vboold8 eq(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_EQ));
}

__forceinline vboold8 ne(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_NE));
}

__forceinline vboold8 lt(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LT));
}

__forceinline vboold8 ge(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GE));
}

__forceinline vboold8 gt(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_GT));
}

__forceinline vboold8 le(const vboold8 mask, const vdouble8& a, const vdouble8& b) {
  return vboold8(_mm512_mask_cmp_pd_mask(mask, a, b, _MM_CMPINT_LE));
}
/* Per-lane choice: returns t[k] where bit k of m is set and f[k] otherwise. */
__forceinline vdouble8 select(const vboold8& m, const vdouble8& t, const vdouble8& f) {
  // blend(k, a, b) yields b in lanes where k is set and a elsewhere
  return _mm512_mask_blend_pd(m, f, t);
}
////////////////////////////////////////////////////////////////////////////////
// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////
/* Shuffles within every 128-bit pair of lanes: the even result lane takes
 * element i0 of its pair and the odd result lane takes element i1.
 * i0 and i1 are expected to be 0 or 1. */
template<int i0, int i1>
__forceinline vdouble8 shuffle(const vdouble8& v) {
  // one selector bit per result lane: even bits carry i0, odd bits carry i1
  enum {
    selectors = (i1 << 7) | (i0 << 6) | (i1 << 5) | (i0 << 4) |
                (i1 << 3) | (i0 << 2) | (i1 << 1) | i0
  };
  return _mm512_permute_pd(v, selectors);
}
/* Replicates element i (0 or 1) of every 128-bit pair into both lanes of that
 * pair; equivalent to the two-index shuffle with both indices equal to i. */
template<int i>
__forceinline vdouble8 shuffle(const vdouble8& v) {
  const vdouble8 duplicated = shuffle<i, i>(v);
  return duplicated;
}
/* Shuffles within each 256-bit half: result lane k of a half takes element
 * i<k> (0..3) of the same half of v. */
template<int i0, int i1, int i2, int i3>
__forceinline vdouble8 shuffle(const vdouble8& v) {
  enum { control = _MM_SHUFFLE(i3, i2, i1, i0) };
  return _mm512_permutex_pd(v, control);
}
/* Shuffles whole 256-bit halves (groups of four doubles): the lower half of
 * the result is half i0 of v and the upper half is half i1 of v.
 * i0 and i1 are expected to be 0 or 1. */
template<int i0, int i1>
__forceinline vdouble8 shuffle4(const vdouble8& v) {
  // the intrinsic moves 128-bit blocks, so each 256-bit half is named by the
  // two consecutive blocks 2*i and 2*i+1
  enum { control = _MM_SHUFFLE(i1*2+1, i1*2, i0*2+1, i0*2) };
  return _mm512_shuffle_f64x2(v, v, control);
}
/* Replicates 256-bit half i (0 or 1) of v into both halves of the result;
 * equivalent to the two-index shuffle4 with both indices equal to i. */
template<int i>
__forceinline vdouble8 shuffle4(const vdouble8& v) {
  const vdouble8 duplicated = shuffle4<i, i>(v);
  return duplicated;
}
/* Treats a (upper part) and b (lower part) as one 16-element sequence, shifts
 * it right by i 64-bit elements and returns the lowest eight elements. The
 * work is done on the integer view of the registers, so bits are moved
 * unchanged. */
template<int i>
__forceinline vdouble8 align_shift_right(const vdouble8& a, const vdouble8& b) {
  const __m512i hi_bits = _mm512_castpd_si512(a);
  const __m512i lo_bits = _mm512_castpd_si512(b);
  return _mm512_castsi512_pd(_mm512_alignr_epi64(hi_bits, lo_bits, i));
}
/* Extracts lane 0 of v as a scalar double. */
__forceinline double toScalar(const vdouble8& v) {
  const __m128d lowest = _mm512_castpd512_pd128(v);
  return _mm_cvtsd_f64(lowest);
}
////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////
/* Horizontal sums that keep the result in vector form.
 *   vreduce_add2: every lane holds the sum of its adjacent pair
 *   vreduce_add4: every lane holds the sum of its 4-lane half
 *   vreduce_add : every lane holds the sum of all 8 lanes */
__forceinline vdouble8 vreduce_add2(vdouble8 x) {
  const vdouble8 neighbours = shuffle<1,0,3,2>(x);   // swap lanes inside each pair
  return x + neighbours;
}

__forceinline vdouble8 vreduce_add4(vdouble8 x) {
  const vdouble8 pairs = vreduce_add2(x);
  const vdouble8 other_pair = shuffle<2,3,0,1>(pairs); // swap the pairs inside each half
  return pairs + other_pair;
}

__forceinline vdouble8 vreduce_add (vdouble8 x) {
  const vdouble8 halves = vreduce_add4(x);
  const vdouble8 other_half = shuffle4<1,0>(halves);  // swap the two 256-bit halves
  return halves + other_half;
}
/* Horizontal minima that keep the result in vector form.
 *   vreduce_min2: every lane holds the minimum of its adjacent pair
 *   vreduce_min4: every lane holds the minimum of its 4-lane half
 *   vreduce_min : every lane holds the minimum of all 8 lanes */
__forceinline vdouble8 vreduce_min2(vdouble8 x) {
  const vdouble8 neighbours = shuffle<1,0,3,2>(x);   // swap lanes inside each pair
  return min(x, neighbours);
}

__forceinline vdouble8 vreduce_min4(vdouble8 x) {
  const vdouble8 pairs = vreduce_min2(x);
  const vdouble8 other_pair = shuffle<2,3,0,1>(pairs); // swap the pairs inside each half
  return min(pairs, other_pair);
}

__forceinline vdouble8 vreduce_min (vdouble8 x) {
  const vdouble8 halves = vreduce_min4(x);
  const vdouble8 other_half = shuffle4<1,0>(halves);  // swap the two 256-bit halves
  return min(halves, other_half);
}
/* Horizontal maxima that keep the result in vector form.
 *   vreduce_max2: every lane holds the maximum of its adjacent pair
 *   vreduce_max4: every lane holds the maximum of its 4-lane half
 *   vreduce_max : every lane holds the maximum of all 8 lanes */
__forceinline vdouble8 vreduce_max2(vdouble8 x) {
  const vdouble8 neighbours = shuffle<1,0,3,2>(x);   // swap lanes inside each pair
  return max(x, neighbours);
}

__forceinline vdouble8 vreduce_max4(vdouble8 x) {
  const vdouble8 pairs = vreduce_max2(x);
  const vdouble8 other_pair = shuffle<2,3,0,1>(pairs); // swap the pairs inside each half
  return max(pairs, other_pair);
}

__forceinline vdouble8 vreduce_max (vdouble8 x) {
  const vdouble8 halves = vreduce_max4(x);
  const vdouble8 other_half = shuffle4<1,0>(halves);  // swap the two 256-bit halves
  return max(halves, other_half);
}
/* Scalar reductions: run the corresponding vector reduction and return its
 * lane 0. */

/* Sum of all eight lanes. */
__forceinline double reduce_add(const vdouble8& v) {
  const vdouble8 total = vreduce_add(v);
  return toScalar(total);
}

/* Minimum over all eight lanes. */
__forceinline double reduce_min(const vdouble8& v) {
  const vdouble8 lowest = vreduce_min(v);
  return toScalar(lowest);
}

/* Maximum over all eight lanes. */
__forceinline double reduce_max(const vdouble8& v) {
  const vdouble8 highest = vreduce_max(v);
  return toScalar(highest);
}
////////////////////////////////////////////////////////////////////////////////
/// Permutation operations
////////////////////////////////////////////////////////////////////////////////
/* General lane permutation: lane k of the result is the lane of v selected by
 * index[k]. Note that the intrinsic takes the index vector first. */
__forceinline vdouble8 permute(const vdouble8& v, const vllong8& index) {
  const __m512d gathered = _mm512_permutexvar_pd(index, v);
  return gathered;
}
/* Permutes a with the index vector built from reverse_step.
 * NOTE(review): presumably that index vector is 7,6,...,0 so the lane order is
 * mirrored -- confirm against the vllong8 constructor. */
__forceinline vdouble8 reverse(const vdouble8& a) {
  const vllong8 order(reverse_step);
  return permute(a, order);
}
////////////////////////////////////////////////////////////////////////////////
/// Output Operators
////////////////////////////////////////////////////////////////////////////////
/* Prints the eight lanes as "<v0, v1, ..., v7>" and returns the stream. */
__forceinline embree_ostream operator <<(embree_ostream cout, const vdouble8& v)
{
  cout << "<";
  for (size_t lane = 0; lane < 8; lane++) {
    if (lane != 0) cout << ", ";   // separator goes in front of every lane but the first
    cout << v[lane];
  }
  cout << ">";
  return cout;
}
}
/* Remove the *_impl name redirections that were defined at the top of this
 * header so they are no longer active after it. */
#undef vboolf
#undef vboold
#undef vint
#undef vuint
#undef vllong
#undef vfloat
#undef vdouble
|