diff options
author | Juan Linietsky <reduzio@gmail.com> | 2020-05-01 09:34:23 -0300 |
---|---|---|
committer | Juan Linietsky <reduzio@gmail.com> | 2020-05-10 15:59:09 -0300 |
commit | 1bea8e1eacc68bcedbd3f207395bccf11011dae2 (patch) | |
tree | b75303a69491978c1e13360a3e6f355c5234dfe0 /thirdparty/oidn/mkl-dnn/src/cpu/jit_generator.hpp | |
parent | 6a0473bcc23c096ef9ee929632a209761c2668f6 (diff) |
New lightmapper
-Added LocalVector (needed it)
-Added stb_rect_pack (It's pretty cool, we could probably use it for other stuff too)
-Fixes and changes all around the place
-Added library for 128 bits fixed point (required for Delaunay3D)
Diffstat (limited to 'thirdparty/oidn/mkl-dnn/src/cpu/jit_generator.hpp')
-rw-r--r-- | thirdparty/oidn/mkl-dnn/src/cpu/jit_generator.hpp | 773 |
1 files changed, 773 insertions, 0 deletions
diff --git a/thirdparty/oidn/mkl-dnn/src/cpu/jit_generator.hpp b/thirdparty/oidn/mkl-dnn/src/cpu/jit_generator.hpp new file mode 100644 index 0000000000..c09592d5c9 --- /dev/null +++ b/thirdparty/oidn/mkl-dnn/src/cpu/jit_generator.hpp @@ -0,0 +1,773 @@ +/******************************************************************************* +* Copyright 2016-2018 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#ifndef CPU_JIT_AVX2_GENERATOR_HPP +#define CPU_JIT_AVX2_GENERATOR_HPP + +#include <limits.h> + +#include "mkldnn_thread.hpp" +#include "utils.hpp" + +#include "cpu_isa_traits.hpp" +#include "jit_utils/jit_utils.hpp" + +#if defined(_WIN32) && !defined(__GNUC__) +# define STRUCT_ALIGN(al, ...) __declspec(align(al)) __VA_ARGS__ +#else +# define STRUCT_ALIGN(al, ...) __VA_ARGS__ __attribute__((__aligned__(al))) +#endif + +#if defined(_WIN32) +# define OFFSET_SHADOWSPACE 0x28 +#endif + +#define DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_name) \ + const char *name() const override { return STRINGIFY(jit_name); } \ + const char *source_file() const override { return __FILE__; } + +namespace mkldnn { +namespace impl { +namespace cpu { + +// TODO: move this to jit_generator class? +namespace { + +typedef enum { + PAGE_4K = 4096, + PAGE_2M = 2097152, +} cpu_page_size_t; + +// TODO: move this somewhere else? Although this is only used by jit kernels +// (Roma) +static inline int float2int(float x) { + union { + float vfloat; + int vint; + } cvt; + cvt.vfloat = x; + return cvt.vint; +} + +// TODO: A GPR class that hides ABI details from the JIT kernels and allows +// numbering registers from 0 to 14 (x86_64) / 6 (x32) (gpr0, gpr1, ...) and +// stack register (sr). +// +// This will allow using syntax like this: +// +// param = gpr0; +// reg_input = gpr0; +// reg_output = gpr1; +// ... +// +// #ifndef XBYAK64 +// mov(param, ptr[sr]) +// #endif +// +// (Roma) + +#ifdef XBYAK64 +constexpr Xbyak::Operand::Code abi_save_gpr_regs[] = { + Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12, + Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15, +#ifdef _WIN32 + Xbyak::Operand::RDI, Xbyak::Operand::RSI, +#endif +}; + +#ifdef _WIN32 +static const Xbyak::Reg64 abi_param1(Xbyak::Operand::RCX), + abi_param2(Xbyak::Operand::RDX), + abi_param3(Xbyak::Operand::R8), + abi_param4(Xbyak::Operand::R9), + abi_not_param1(Xbyak::Operand::RDI); +#else +static const Xbyak::Reg64 abi_param1(Xbyak::Operand::RDI), + abi_param2(Xbyak::Operand::RSI), + abi_param3(Xbyak::Operand::RDX), + abi_param4(Xbyak::Operand::RCX), + abi_param5(Xbyak::Operand::R8), + abi_param6(Xbyak::Operand::R9), + abi_not_param1(Xbyak::Operand::RCX); +#endif +#endif + +inline unsigned int get_cache_size(int level, bool per_core = true){ + unsigned int l = level - 1; + // Currently, if XByak is not able to fetch the cache topology + // we default to 32KB of L1, 512KB of L2 and 1MB of L3 per core. + if (cpu.getDataCacheLevels() == 0){ + const int L1_cache_per_core = 32000; + const int L2_cache_per_core = 512000; + const int L3_cache_per_core = 1024000; + int num_cores = per_core ? 1 : mkldnn_get_max_threads(); + switch(l){ + case(0): return L1_cache_per_core * num_cores; + case(1): return L2_cache_per_core * num_cores; + case(2): return L3_cache_per_core * num_cores; + default: return 0; + } + } + if (l < cpu.getDataCacheLevels()) { + return cpu.getDataCacheSize(l) + / (per_core ? cpu.getCoresSharingDataCache(l) : 1); + } else + return 0; +} + +} + +class jit_generator : public Xbyak::CodeGenerator +{ +private: + const size_t xmm_len = 16; +#ifdef _WIN32 + const size_t xmm_to_preserve_start = 6; + const size_t xmm_to_preserve = 10; +#else + const size_t xmm_to_preserve_start = 0; + const size_t xmm_to_preserve = 0; +#endif + + const size_t num_abi_save_gpr_regs + = sizeof(abi_save_gpr_regs) / sizeof(abi_save_gpr_regs[0]); + + const size_t size_of_abi_save_regs + = num_abi_save_gpr_regs * rax.getBit() / 8 + + xmm_to_preserve * xmm_len; + +public: + enum { + _cmp_eq_oq = 0u, + _cmp_lt_os = 1u, + _cmp_le_os = 2u, + _cmp_neq_uq = 4u, + _cmp_nlt_us = 5u, + _cmp_nle_us = 6u, + + _op_floor = 1u, + _op_mxcsr = 4u, + }; + + Xbyak::Reg64 param1 = abi_param1; + const int EVEX_max_8b_offt = 0x200; + const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp; + + inline size_t get_size_of_abi_save_regs() { + return size_of_abi_save_regs; + } + + void preamble() { + if (xmm_to_preserve) { + sub(rsp, xmm_to_preserve * xmm_len); + for (size_t i = 0; i < xmm_to_preserve; ++i) + movdqu(ptr[rsp + i * xmm_len], Xbyak::Xmm(xmm_to_preserve_start + i)); + } + for (size_t i = 0; i < num_abi_save_gpr_regs; ++i) + push(Xbyak::Reg64(abi_save_gpr_regs[i])); + if (mayiuse(avx512_common)) { + mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); + } + } + + void mic_prefetcht0(Xbyak::Address a) { + if (mayiuse(avx512_mic)) + prefetcht0(a); + } + + void mic_prefetcht1(Xbyak::Address a) { + if (mayiuse(avx512_mic)) + prefetcht1(a); + } + + void mic_prefetcht2(Xbyak::Address a) { + if (mayiuse(avx512_mic)) + prefetcht2(a); + } + + void uni_vzeroupper() { + if (mayiuse(avx) && !mayiuse(avx512_mic)) + vzeroupper(); + } + + void postamble() { + for (size_t i = 0; i < num_abi_save_gpr_regs; ++i) + pop(Xbyak::Reg64(abi_save_gpr_regs[num_abi_save_gpr_regs - 1 - i])); + if (xmm_to_preserve) { + for (size_t i = 0; i < xmm_to_preserve; ++i) + movdqu(Xbyak::Xmm(xmm_to_preserve_start + i), ptr[rsp + i * xmm_len]); + add(rsp, xmm_to_preserve * xmm_len); + } + uni_vzeroupper(); + ret(); + } + + template<typename T> + Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, + T raw_offt, bool bcast = false) + { + using Xbyak::Zmm; + using Xbyak::Reg64; + using Xbyak::Address; + using Xbyak::RegExp; + + assert(raw_offt <= INT_MAX); + auto offt = static_cast<int>(raw_offt); + + int scale = 0; + + if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) { + offt = offt - 2 * EVEX_max_8b_offt; + scale = 1; + } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) { + offt = offt - 4 * EVEX_max_8b_offt; + scale = 2; + } + + auto re = RegExp() + base + offt; + if (scale) + re = re + reg_EVEX_max_8b_offt * scale; + + if (bcast) + return zword_b [re]; + else + return zword [re]; + } + + Xbyak::Address make_safe_addr(const Xbyak::Reg64 ®_out, size_t offt, + const Xbyak::Reg64 &tmp_reg, bool bcast = false) { + if (offt > INT_MAX) { + mov(tmp_reg, offt); + return bcast ? ptr_b[reg_out + tmp_reg] : ptr[reg_out + tmp_reg]; + } else { + return bcast ? ptr_b[reg_out + offt] : ptr[reg_out + offt]; + } + } + + Xbyak::Address EVEX_compress_addr_safe(const Xbyak::Reg64 &base, + size_t raw_offt, const Xbyak::Reg64 ®_offt, bool bcast = false) { + if (raw_offt > INT_MAX) { + return make_safe_addr(base, raw_offt, reg_offt, bcast); + } else { + return EVEX_compress_addr(base, raw_offt, bcast); + } + } + + void safe_add(const Xbyak::Reg64 &base, size_t raw_offt, + const Xbyak::Reg64 ®_offt) { + if (raw_offt > INT_MAX) { + mov(reg_offt, raw_offt); + add(base, reg_offt); + } else { + add(base, raw_offt); + } + } + + void safe_sub(const Xbyak::Reg64 &base, size_t raw_offt, + const Xbyak::Reg64 ®_offt) { + if (raw_offt > INT_MAX) { + mov(reg_offt, raw_offt); + sub(base, reg_offt); + } else { + sub(base, raw_offt); + } + } + + // Disallow char-based labels completely + void L(const char *label) = delete; + void L(Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); } + + void uni_vpxor(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op) { + assert(x1.getIdx() == x2.getIdx()); + pxor(x2, op); + } + void uni_vpxor(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, + const Xbyak::Operand &op) { + if (mayiuse(avx2)) { + vpxor(x1, x2, op); + } else { + vxorps(x1, x2, op); + } + } + void uni_vpxor(const Xbyak::Zmm &x1, const Xbyak::Zmm &x2, + const Xbyak::Operand &op) { + vpxord(x1, x2, op); + } + + void uni_vmovss(const Xbyak::Address& addr, const Xbyak::Xmm &x) { + movss(addr, x); + } + void uni_vmovss(const Xbyak::Address& addr, const Xbyak::Ymm &x) { + vmovss(addr, x); + } + void uni_vmovss(const Xbyak::Xmm &x, const Xbyak::Address& addr) { + movss(x, addr); + } + void uni_vmovss(const Xbyak::Ymm &x, const Xbyak::Address& addr) { + vmovss(x, addr); + } + + void uni_vmovsd(const Xbyak::Address& addr, const Xbyak::Xmm &x) { + movsd(addr, x); + } + void uni_vmovsd(const Xbyak::Address& addr, const Xbyak::Ymm &x) { + vmovsd(addr, x); + } + void uni_vmovsd(const Xbyak::Xmm &x, const Xbyak::Address& addr) { + movsd(x, addr); + } + void uni_vmovsd(const Xbyak::Ymm &x, const Xbyak::Address& addr) { + vmovsd(x, addr); + } + + void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Xmm &x) { + movdqu(addr, x); + } + void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Ymm &x) { + vmovdqu(addr, x); + } + void uni_vmovdqu(const Xbyak::Address &addr, const Xbyak::Zmm &x) { + vmovdqu32(addr, x); + } + + void uni_vmovdqu(const Xbyak::Xmm &x, const Xbyak::Address &addr) { + movdqu(x, addr); + } + void uni_vmovdqu(const Xbyak::Ymm &x, const Xbyak::Address &addr) { + vmovdqu(x, addr); + } + void uni_vmovdqu(const Xbyak::Zmm &x, const Xbyak::Address &addr) { + vmovdqu32(x, addr); + } + + void uni_vmovups(const Xbyak::Address &addr, const Xbyak::Xmm &x) { + movups(addr, x); + } + void uni_vmovups(const Xbyak::Address &addr, const Xbyak::Ymm &x) { + vmovups(addr, x); + } + + void uni_vmovups(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + movups(x, op); + } + void uni_vmovups(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + vmovups(x, op); + } + + void uni_vmovntps(const Xbyak::Address &addr, const Xbyak::Xmm &x) { + movntps(addr, x); + } + void uni_vmovntps(const Xbyak::Address &addr, const Xbyak::Ymm &x) { + vmovntps(addr, x); + } + + void uni_vbroadcastss(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + movss(x, op); + shufps(x, x, 0x0); + } + void uni_vbroadcastss(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + if (op.isMEM() || mayiuse(avx2)) { + vbroadcastss(x, op); + } else { + Xbyak::Xmm t(x.getIdx()); + if (t.getIdx() != op.getIdx()) movss(t, op); + vinsertf128(x, x, t, 1); + vshufps(x, x, x, 0); + } + } + + void uni_vpbroadcastd(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + movsd(x, op); + pshufd(x, x, 0x0); + } + void uni_vpbroadcastd(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + if (mayiuse(avx2)) { + vpbroadcastd(x, op); + } else { + Xbyak::Xmm t(x.getIdx()); + if (t.getIdx() != op.getIdx()) movsd(t, op); + vinsertf128(x, x, t, 1); + vshufps(x, x, x, 0); + } + } + + void uni_vrcpss(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + rcpss(x, op); + } + void uni_vrcpss(const Xbyak::Ymm &x1, const Xbyak::Xmm &x2) { + Xbyak::Xmm x1_(x1.getIdx()); + Xbyak::Xmm x2_(x2.getIdx()); + vrcpss(x1_, x1_, x2_); + } + void uni_vrcpss(const Xbyak::Ymm &x, const Xbyak::Address &op) { + Xbyak::Xmm x_(x.getIdx()); + vrcpss(x_, x_, op); + } + + void uni_vrcpps(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + rcpps(x, op); + } + void uni_vrcpps(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + vrcpps(x, op); + } + void uni_vrcpps(const Xbyak::Zmm &x, const Xbyak::Operand &op) { + vrcp14ps(x, op); + } + + void uni_vdivps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + assert(x.getIdx() == op1.getIdx()); + divps(x, op2); + } + void uni_vdivps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + vdivps(x, op1, op2); + } + + void uni_vdivps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2, const Xbyak::Xmm &buf) { + movups(buf, op1); + divps(buf, op2); + if (x.getIdx() != buf.getIdx()) { + movups(x, buf); + } + } + + void uni_vdivps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2, const Xbyak::Ymm &buf) { + vdivps(x, op1, op2); + } + + void uni_vaddps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + assert(x.getIdx() == op1.getIdx()); + addps(x, op2); + } + void uni_vaddps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + vaddps(x, op1, op2); + } + + void uni_vpsignd(const Xbyak::Xmm& x1, const Xbyak::Xmm& x2, + const Xbyak::Operand& op) { + assert(x1.getIdx() == x2.getIdx()); + psignd(x1, op); + } + void uni_vpsignd(const Xbyak::Ymm& x1, const Xbyak::Ymm& x2, + const Xbyak::Operand& op) { + vpsignd(x1, x2, op); + } + + void uni_vsubps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + assert(x.getIdx() == op1.getIdx()); + subps(x, op2); + } + void uni_vsubps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + vsubps(x, op1, op2); + } + + void uni_vsubps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2, const Xbyak::Xmm &buf) { + movups(buf, op1); + subps(buf, op2); + if (x.getIdx() != buf.getIdx()) { + movups(x, buf); + } + } + + void uni_vsubps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2, const Xbyak::Ymm &buf) { + vsubps(x, op1, op2); + } + + void uni_vmulps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + assert(x.getIdx() == op1.getIdx()); + mulps(x, op2); + } + void uni_vmulps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + vmulps(x, op1, op2); + } + + void uni_vfmadd213ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op) { + mulps(x1, x2); + addps(x1, op); + } + void uni_vfmadd213ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, + const Xbyak::Operand &op) { + vfmadd213ps(x1, x2, op); + } + + void uni_vfmadd231ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op) { + mulps(x2, op); + addps(x1, x2); + } + void uni_vfmadd231ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, + const Xbyak::Operand &op) { + vfmadd231ps(x1, x2, op); + } + + void uni_vfnmadd231ps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op) { + mulps(x2, op); + subps(x1, x2); + } + + void uni_vfnmadd231ps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, + const Xbyak::Operand &op) { + vfnmadd231ps(x1, x2, op); + } + + void uni_vsqrtps(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + sqrtps(x, op); + } + void uni_vsqrtps(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + vsqrtps(x, op); + } + + void uni_vpaddd(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op) { + assert(x1.getIdx() == x2.getIdx()); + paddd(x2, op); + } + void uni_vpaddd(const Xbyak::Ymm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op) { + vpaddd(x1, x2, op); + } + + void uni_vandps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op = Xbyak::Operand()) { + assert(x1.getIdx() == x2.getIdx()); + andps(x1, op); + } + void uni_vandps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, + const Xbyak::Operand &op = Xbyak::Operand()) { + if (!mayiuse(avx512_common) || x1.getBit() < 512) + vandps(x1, x2, op); + else + vpandd(x1, x2, op); + } + + void uni_vorps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op = Xbyak::Operand()) { + assert(x1.getIdx() == x2.getIdx()); + orps(x1, op); + } + void uni_vorps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, + const Xbyak::Operand &op = Xbyak::Operand()) { + if (!mayiuse(avx512_common) || x1.getBit() < 512) + vorps(x1, x2, op); + else + vpord(x1, x2, op); + } + + void uni_vpslld(const Xbyak::Xmm &x, const Xbyak::Operand &op, + const int imm) { + assert(x.getIdx() == op.getIdx()); + pslld(x, imm); + } + void uni_vpslld(const Xbyak::Ymm &x, const Xbyak::Operand &op, + const int imm) { + vpslld(x, op, imm); + } + + void uni_vpsrld(const Xbyak::Xmm &x, const Xbyak::Operand &op, + const int imm) { + assert(x.getIdx() == op.getIdx()); + psrld(x, imm); + } + void uni_vpsrld(const Xbyak::Ymm &x, const Xbyak::Operand &op, + const int imm) { + vpsrld(x, op, imm); + } + + void uni_vmaxps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + assert(x.getIdx() == op1.getIdx()); + maxps(x, op2); + } + void uni_vmaxps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + vmaxps(x, op1, op2); + } + + void uni_vminps(const Xbyak::Xmm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + assert(x.getIdx() == op1.getIdx()); + minps(x, op2); + } + void uni_vminps(const Xbyak::Ymm &x, const Xbyak::Operand &op1, + const Xbyak::Operand &op2 = Xbyak::Operand()) { + vminps(x, op1, op2); + } + + void uni_vcmpgtps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op) { + assert(x1.getIdx() == x2.getIdx()); + cmpps(x1, op, _cmp_nle_us); + } + + void uni_vcmpgtps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, + const Xbyak::Operand &op) { + vcmpgtps(x1, x2, op); + } + + void uni_vcmpgeps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op) { + assert(x1.getIdx() == x2.getIdx()); + cmpps(x1, op, _cmp_nlt_us); + } + + void uni_vcmpgeps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, + const Xbyak::Operand &op) { + vcmpps(x1, x2, op, _cmp_nlt_us); + } + + void uni_vtestps(const Xbyak::Xmm &x1, const Xbyak::Operand &op) { + ptest(x1, op); + } + + void uni_vtestps(const Xbyak::Ymm &x1, const Xbyak::Operand &op) { + assert(!(x1.isZMM() || op.isZMM())); + vtestps(x1, op); + } + + void uni_vblendvps(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, + const Xbyak::Operand &op, const Xbyak::Xmm &msk) { + assert(x1.getIdx() == x2.getIdx()); + assert(msk.getIdx() == 0); + blendvps(x1, op); + } + void uni_vblendvps(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, + const Xbyak::Operand &op, const Xbyak::Ymm &msk) { + vblendvps(x1, x2, op, msk); + } + + void uni_vroundps(const Xbyak::Xmm &x, const Xbyak::Operand &op, + const int imm) { + roundps(x, op, imm); + } + void uni_vroundps(const Xbyak::Ymm &x, const Xbyak::Operand &op, + const int imm) { + vroundps(x, op, imm); + } + + void uni_vcvtps2dq(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + cvtps2dq(x, op); + } + void uni_vcvtps2dq(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + vcvtps2dq(x, op); + } + + void uni_vcvtdq2ps(const Xbyak::Xmm &x, const Xbyak::Operand &op) { + cvtdq2ps(x, op); + } + void uni_vcvtdq2ps(const Xbyak::Ymm &x, const Xbyak::Operand &op) { + vcvtdq2ps(x, op); + } + + void uni_vmovmskps(const Xbyak::Reg &x1, const Xbyak::Xmm &x2) { + movmskps(x1.cvt64(), x2); + } + void uni_vmovmskps(const Xbyak::Reg &x1, const Xbyak::Ymm &x2) { + vmovmskps(x1, x2); + } + + void uni_vpackssdw(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op){ + assert(x1.getIdx() == x1.getIdx()); + packssdw(x1, op); + } + void uni_vpackssdw(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op){ + vpackssdw(x1, x2, op); + } + + void uni_vpackuswb(const Xbyak::Xmm &x1, const Xbyak::Xmm &x2, const Xbyak::Operand &op){ + assert(x1.getIdx() == x1.getIdx()); + packuswb(x1, op); + } + void uni_vpackuswb(const Xbyak::Ymm &x1, const Xbyak::Ymm &x2, const Xbyak::Operand &op){ + vpackuswb(x1, x2, op); + } + + + void mul_by_const(const Xbyak::Reg &out, + const Xbyak::Reg64 &tmp, int value) { + // Generates a shift + add sequence for multiplicating contents of the + // out register by a known JIT-time value. Clobbers the tmp register. + // + // Pros compared to mul/imul: + // - does not require using known registers + // - not microcoded on Intel(R) Xeon Phi(TM) processors + // Still, there are probably a lot of cases when mul/imul is faster on + // Intel(R) Core(TM) processors. Not intended for critical path. + + // TODO: detect when overflow is emminent (Roma) + // TODO: detect when using mul/imul is a better option (Roma) + + int p = 0; // the current power of 2 + int old_p = 0; // the last seen power of 2 such that value[old_p] != 0 + + xor_(tmp, tmp); + while (value) { + if (value & 1) { + int shift = p - old_p; + if (shift) { + shl(out, shift); + old_p = p; + } + add(tmp, out); + } + value >>= 1; + p++; + } + mov(out, tmp); + } + +public: + jit_generator( + void *code_ptr = nullptr, + size_t code_size = 256 * 1024 + ) : Xbyak::CodeGenerator(code_size, code_ptr) + { + } + virtual ~jit_generator() {} + + virtual const char *name() const = 0; + virtual const char *source_file() const = 0; + + const Xbyak::uint8 *getCode() { + const Xbyak::uint8 *code = CodeGenerator::getCode(); + size_t code_size = getSize(); + jit_utils::register_jit_code(code, code_size, name(), source_file()); + return code; + } + + template<typename F> const F getCode() { + return (const F)getCode(); + } +}; + +} +} +} + +#endif |