1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
|
// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
#pragma once
/* Make precision match SSE, at the cost of some performance */
#if !defined(__aarch64__)
# define SSE2NEON_PRECISE_DIV 1
# define SSE2NEON_PRECISE_SQRT 1
#endif
#include "sse2neon.h"
__forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) {
__m128 neg_c = vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(c)));
return _mm_fmadd_ps(a, b, neg_c);
}
__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) {
#if defined(__aarch64__)
return vreinterpretq_m128_f32(vfmsq_f32(vreinterpretq_f32_m128(c),
vreinterpretq_f32_m128(b),
vreinterpretq_f32_m128(a)));
#else
return _mm_sub_ps(c, _mm_mul_ps(a, b));
#endif
}
__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) {
return vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(_mm_fmadd_ps(a,b,c))));
}
/* Dummy defines for floating point control */
#define _MM_MASK_MASK 0x1f80
#define _MM_MASK_DIV_ZERO 0x200
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_MASK_DENORM 0x100
#define _MM_SET_EXCEPTION_MASK(x)
#define _MM_SET_FLUSH_ZERO_MODE(x)
__forceinline int _mm_getcsr()
{
return 0;
}
__forceinline void _mm_mfence()
{
__sync_synchronize();
}
|