Diffstat (limited to 'thirdparty/oidn/mkl-dnn/src/cpu/cpu_reducer.cpp')
-rw-r--r--  thirdparty/oidn/mkl-dnn/src/cpu/cpu_reducer.cpp  544
1 file changed, 544 insertions, 0 deletions
diff --git a/thirdparty/oidn/mkl-dnn/src/cpu/cpu_reducer.cpp b/thirdparty/oidn/mkl-dnn/src/cpu/cpu_reducer.cpp
new file mode 100644
index 0000000000..1d41ac5cea
--- /dev/null
+++ b/thirdparty/oidn/mkl-dnn/src/cpu/cpu_reducer.cpp
@@ -0,0 +1,544 @@
+/*******************************************************************************
+* Copyright 2017-2018 Intel Corporation
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*******************************************************************************/
+
+#include <assert.h>
+
+#include "mkldnn_thread.hpp"
+#include "mkldnn_types.h"
+#include "nstl.hpp"
+#include "utils.hpp"
+
+#include "cpu_reducer.hpp"
+
+namespace mkldnn {
+namespace impl {
+namespace cpu {
+
+using namespace memory_tracking::names;
+
+void reduce_balancer_t::balance() {
+    using namespace nstl;
+    using namespace utils;
+
+    assert(nthr_ > 0 && job_size_ > 0 && njobs_ > 0 && reduction_size_ > 0);
+
+    const int job_complexity = 1;
+
+    const int min_njobs_per_group = max(1, njobs_ / nthr_);
+    const int max_njobs_per_group = max(1,
+            static_cast<int>(max_buffer_size_ / (nthr_ * job_size_)));
+
+    /* initial guess */
+    int ngroups = min(njobs_ / min_njobs_per_group, nthr_);
+    int nthr_per_group = syncable_ ? min(nthr_ / ngroups, reduction_size_) : 1;
+    int njobs_per_group_ub = div_up(njobs_, ngroups);
+
+    /* rough upper-bound estimation, will be fixed during brute force */
+    size_t thread_complexity_ub = njobs_ * job_size_ * reduction_size_;
+
+    /* brute force parameters for the best balance... */
+    for (int c_njobs_per_group = min_njobs_per_group;
+            c_njobs_per_group < njobs_; ++c_njobs_per_group) {
+        /* current assumption */
+        int c_ngroups = min(njobs_ / c_njobs_per_group, nthr_);
+        int c_nthr_per_group = syncable_
+            ? min(nthr_ / c_ngroups, reduction_size_) : 1;
+        int c_njobs_per_group_ub = div_up(njobs_, c_ngroups);
+
+        if (c_nthr_per_group > 1 && c_njobs_per_group_ub > max_njobs_per_group)
+            continue;
+
+        int c_thread_reduction_ub = div_up(reduction_size_, c_nthr_per_group);
+        size_t c_group_size_ub = job_size_ * c_njobs_per_group_ub;
+        size_t c_thread_complexity_ub = c_group_size_ub * (
+                job_complexity * c_thread_reduction_ub
+                + (c_nthr_per_group != 1));
+
+        if (c_thread_complexity_ub < thread_complexity_ub) {
+            ngroups = c_ngroups;
+            nthr_per_group = c_nthr_per_group;
+            njobs_per_group_ub = c_njobs_per_group_ub;
+            thread_complexity_ub = c_thread_complexity_ub;
+        }
+    }
+
+    assert(njobs_per_group_ub <= max_njobs_per_group || nthr_per_group == 1);
+    assert(ngroups * nthr_per_group <= nthr_);
+    assert((size_t)njobs_per_group_ub * job_size_ * nthr_ <= max_buffer_size_
+            || nthr_per_group == 1); /* no reduction buffer overflow */
+    assert(IMPLICATION(!syncable_, nthr_per_group == 1));
+
+    ngroups_ = ngroups;
+    nthr_per_group_ = nthr_per_group;
+    njobs_per_group_ub_ = njobs_per_group_ub;
+}
+
+/* reducer jit-ted driver */
+
+using namespace Xbyak;
+
+template <impl::data_type_t data_type>
+struct reducer_2d_driver_t: public c_compatible {
+    typedef typename prec_traits<data_type>::type data_t;
+
+    reducer_2d_driver_t(int n_src, size_t src_ld,
+            size_t src_step, size_t dst_step, bool nullify_dst)
+        : n_src_(n_src), src_ld_(src_ld), src_step_(src_step)
+        , dst_step_(dst_step), nullify_dst_(nullify_dst), ker_(nullptr) {}
+    virtual ~reducer_2d_driver_t() {}
+    void operator()(data_t *dst, const data_t *srcs, size_t ny, size_t nx)
+    { assert(ker_); ker_(dst, srcs, ny, nx); }
+
+protected:
+    int n_src_;
+    size_t src_ld_, src_step_, dst_step_;
+    bool nullify_dst_;
+    void (*ker_)(data_t *dst, const data_t *srcs, size_t ny, size_t nx);
+};
+
+template <impl::data_type_t data_type, cpu_isa_t isa>
+struct reducer_2d_driver_f_s_32_t: public reducer_2d_driver_t<data_type>,
+    public jit_generator
+{
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(reducer_2d_driver_f_s_32_t)
+
+    /* cpu specific part */
+    using Vmm = typename utils::conditional<isa == avx2, Ymm, Zmm>::type;
+    const AddressFrame &vmmword = (isa == avx2) ? yword : zword;
+    void uni_vadd(const Xmm& x1, const Xmm& x2, const Operand& op)
+    { if (data_type == data_type::f32) vaddps(x1, x2, op);
+        else vpaddd(x1, x2, op); }
+    void uni_add(const Xmm& x1, const Operand& op)
+    { if (data_type == data_type::f32) addss(x1, op); else paddd(x1, op); }
+
+    const int vlen = cpu_isa_traits<isa>::vlen;
+    const int typesize
+        = sizeof(typename mkldnn::impl::prec_traits<data_type>::type);
+    Xbyak::Reg64 reg_dst = abi_param1;
+    Xbyak::Reg64 reg_src = abi_param2;
+    Xbyak::Reg64 reg_ny = abi_param3;
+    Xbyak::Reg64 reg_nx = abi_param4;
+
+    Xbyak::Reg64 reg_x = rax;
+    Xbyak::Reg64 reg_src_id = r10;
+
+    reducer_2d_driver_f_s_32_t(int n_src, size_t src_ld, size_t src_step,
+            size_t dst_step, bool nullify_dst)
+        : reducer_2d_driver_t<data_type>(n_src, src_ld, src_step,
+                dst_step, nullify_dst)
+    { generate(); }
+
+    void nullify_dst(int nloads, int load_len) {
+        UNUSED(load_len);
+        for (int i = 0; i < nloads; ++i)
+            uni_vpxor(Vmm(i), Vmm(i), Vmm(i));
+        /* prefetches[dst] ? */
+    }
+
+    void load_dst(int nloads, int load_len) {
+        for (int i = 0; i < nloads; ++i) {
+            if (load_len == typesize)
+                movd(Xmm(i), ptr[reg_dst + i * load_len]);
+            else if (load_len == vlen)
+                vmovups(Vmm(i), ptr[reg_dst + i * load_len]);
+            else
+                assert(!"unsupported");
+        }
+    }
+
+    void store_dst(int nloads, int load_len) {
+        for (int i = 0; i < nloads; ++i) {
+            if (load_len == typesize)
+                movd(ptr[reg_dst + i * load_len], Xmm(i));
+            else if (load_len == vlen)
+                vmovups(ptr[reg_dst + i * load_len], Vmm(i));
+            else
+                assert(!"unsupported");
+        }
+    }
+
+    void accumulate(int nloads, int load_len, size_t base_off) {
+        for (int i = 0; i < nloads; ++i) {
+            size_t off = base_off + i * load_len;
+
+            if (load_len == typesize)
+                uni_add(Xmm(i), ptr[reg_src + off]);
+            else if (load_len == vlen)
+                uni_vadd(Vmm(i), Vmm(i), vmmword[reg_src + off]);
+            else
+                assert(!"unsupported");
+        }
+    }
+
+    void loop_x() {
+        const int nloads[] = {cpu_isa_traits<isa>::n_vregs, 1, 1};
+        const int nbranches = sizeof(nloads) / sizeof(nloads[0]);
+
+        const int load_len[nbranches] = {vlen, vlen, typesize};
+        Label loop_x_label[nbranches + 1];
+
+        mov(reg_x, reg_nx);
+
+        for (int id = 0; id < nbranches; ++id) {
+            L(loop_x_label[id]);
+
+            cmp(reg_x, nloads[id] * load_len[id]);
+            jl(loop_x_label[id + 1], T_NEAR);
+
+            if (this->nullify_dst_)
+                nullify_dst(nloads[id], load_len[id]);
+            else
+                load_dst(nloads[id], load_len[id]);
+
+            if (nloads[id] > 1) {
+                Label loop_srcs;
+                mov(reg_src_id, this->n_src_);
+                L(loop_srcs);
+
+                accumulate(nloads[id], load_len[id], 0);
+                add(reg_src, this->src_ld_ * typesize);
+
+                dec(reg_src_id);
+                jnz(loop_srcs, T_NEAR);
+
+                sub(reg_src, this->n_src_ * this->src_ld_ * typesize);
+            } else {
+                for (int src_id = 0; src_id < this->n_src_; ++src_id) {
+                    const size_t base_off = src_id * this->src_ld_ * typesize;
+                    accumulate(nloads[id], load_len[id], base_off);
+                }
+            }
+
+            store_dst(nloads[id], load_len[id]);
+
+            add(reg_src, nloads[id] * load_len[id]);
+            add(reg_dst, nloads[id] * load_len[id]);
+
+            sub(reg_x, nloads[id] * load_len[id]);
+
+            jmp(loop_x_label[id], T_NEAR);
+        }
+
+        L(loop_x_label[nbranches]);
+
+        /* restore address registers */
+        sub(reg_src, reg_nx);
+        sub(reg_dst, reg_nx);
+    }
+
+    void generate() {
+        assert(isa == avx2 || isa == avx512_common || isa == avx512_mic);
+
+        preamble();
+
+        shl(reg_nx, 2);
+
+        Label ny_loop;
+        L(ny_loop);
+
+        loop_x();
+
+        add(reg_dst, this->dst_step_ * typesize);
+        add(reg_src, this->src_step_ * typesize);
+
+        dec(reg_ny);
+        jnz(ny_loop, T_NEAR);
+
+        postamble();
+        this->ker_ = reinterpret_cast<decltype(this->ker_)>(
+                const_cast<uint8_t*>(this->getCode()));
+    }
+};
+
+template <impl::data_type_t data_type>
+inline reducer_2d_driver_t<data_type> *create_reduce_2d_drv(int n_src,
+        size_t src_ld, size_t src_step, size_t dst_step, bool nullify_dst) {
+    if (mayiuse(avx512_common))
+        return new reducer_2d_driver_f_s_32_t<data_type, avx512_common>(n_src,
+                src_ld, src_step, dst_step, nullify_dst);
+    else if (mayiuse(avx2))
+        return new reducer_2d_driver_f_s_32_t<data_type, avx2>(n_src, src_ld,
+                src_step, dst_step, nullify_dst);
+    assert(!"unimplemented");
+    return nullptr;
+}
+
+/* cpu_reducer_t */
+
+template <impl::data_type_t data_type>
+void cpu_reducer_t<data_type>::conf_t::init_scratchpad(
+        memory_tracking::registrar_t &scratchpad) const {
+    if (balancer_.nthr_per_group_ == 1) return;
+
+    const size_t space_size = balancer_.ngroups_
+        * (balancer_.nthr_per_group_ - 1)
+        * cpu_reducer_t<data_type>::space_per_thread(balancer_);
+    scratchpad.book(key_reducer_space, sizeof(data_t) * space_size, PAGE_4K);
+    scratchpad.book(key_reducer_space_bctx,
+            sizeof(simple_barrier::ctx_t) * balancer_.ngroups_);
+}
+
+template <impl::data_type_t data_type>
+cpu_reducer_t<data_type>::cpu_reducer_t(const conf_t &conf)
+    : conf_(conf), drv_(nullptr)
+{
+    if (balancer().nthr_per_group_ == 1) return;
+
+    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_ - 1,
+            space_per_thread(balancer()), 0, 0, false);
+}
+
+template <impl::data_type_t data_type>
+cpu_reducer_t<data_type>::~cpu_reducer_t() { delete drv_; }
+
+template <impl::data_type_t data_type>
+typename cpu_reducer_t<data_type>::data_t *
+cpu_reducer_t<data_type>::get_local_ptr(int ithr, data_t *dst,
+        const memory_tracking::grantor_t &scratchpad) const {
+    const int id_in_grp = balancer().id_in_group(ithr);
+
+    /* thread 0 of each group writes directly to the destination */
+    if (id_in_grp == 0)
+        return dst + balancer().ithr_job_off(ithr) * balancer().job_size_;
+
+    const int grp_id = balancer().group_id(ithr);
+    const int offset_factor = grp_id * (balancer().nthr_per_group_ - 1)
+        + (id_in_grp - 1);
+
+    auto space = scratchpad.template get<data_t>(key_reducer_space);
+    return space + offset_factor * space_per_thread(balancer());
+}
+
+template <impl::data_type_t data_type>
+void cpu_reducer_t<data_type>::reduce_nolock(int ithr, data_t *dst,
+        const memory_tracking::grantor_t &scratchpad) const {
+    bool redundant_reduction = balancer().nthr_per_group_ == 1
+        || balancer().idle(ithr);
+    if (redundant_reduction) return;
+
+#ifdef SIMPLE_IMPL
+    if (balancer().id_in_group(ithr) != 0)
+        return; /* only thread 0 does the reduction */
+
+    const int njobs_in_grp = balancer().ithr_njobs(ithr);
+    data_t *d = get_local_ptr(ithr, dst, scratchpad);
+    for (int id_in_grp = 1; id_in_grp < balancer_.nthr_per_group_; ++id_in_grp)
+    {
+        const data_t *space = get_local_ptr(ithr + id_in_grp, dst, scratchpad);
+        for (size_t i = 0; i < (size_t)njobs_in_grp * balancer().job_size_; ++i)
+            d[i] += space[i];
+    }
+#else
+    using namespace utils;
+
+    const int id_in_grp = balancer().id_in_group(ithr);
+    const int njobs_in_grp = balancer().ithr_njobs(ithr);
+    const size_t cl = 64 / sizeof(data_t);
+
+    const size_t reduction_size = njobs_in_grp * balancer().job_size_;
+    size_t start{0}, end{0};
+    balance211(div_up(reduction_size, cl), balancer().nthr_per_group_,
+            id_in_grp, start, end);
+
+    if (start == end) return;
+
+    data_t *d = get_local_ptr(ithr - id_in_grp, dst, scratchpad) + start * cl;
+    const data_t *space = get_local_ptr(ithr - id_in_grp + 1, dst, scratchpad)
+        + start * cl;
+    const size_t len = nstl::min(end * cl, reduction_size) - start * cl;
+
+    (*drv_)(d, space, 1, len);
+#endif
+}
+
+template struct cpu_reducer_t<data_type::f32>;
+template struct cpu_reducer_t<data_type::s32>;
+
+/* cpu_reducer_2d_t */
+
+template <impl::data_type_t data_type>
+void cpu_reducer_2d_t<data_type>::conf_t::init_scratchpad(
+        memory_tracking::registrar_t &scratchpad) const {
+    if (balancer_.nthr_per_group_ == 1) return;
+
+    const size_t space_size = balancer_.ngroups_ * balancer_.nthr_per_group_
+        * cpu_reducer_2d_t<data_type>::space_per_thread(balancer_);
+    scratchpad.book(key_reducer_space, sizeof(data_t) * space_size);
+    scratchpad.book(key_reducer_space_bctx,
+            sizeof(simple_barrier::ctx_t) * balancer_.ngroups_);
+}
+
+template <impl::data_type_t data_type>
+cpu_reducer_2d_t<data_type>::cpu_reducer_2d_t(const conf_t &conf)
+    : conf_(conf), drv_(nullptr)
+{
+    if (balancer().nthr_per_group_ == 1) return;
+
+    drv_ = create_reduce_2d_drv<data_type>(balancer().nthr_per_group_,
+            space_per_thread(balancer()), conf_.job_size_x_, conf_.dst_x_,
+            true);
+}
+
+template <impl::data_type_t data_type>
+cpu_reducer_2d_t<data_type>::~cpu_reducer_2d_t() { delete drv_; }
+
+template <impl::data_type_t data_type>
+typename cpu_reducer_2d_t<data_type>::data_t *cpu_reducer_2d_t<data_type>::
+get_local_ptr(int ithr, const memory_tracking::grantor_t &scratchpad) const {
+    const int id_in_grp = balancer().id_in_group(ithr);
+    const int grp_id = balancer().group_id(ithr);
+    const int offset_factor = grp_id * balancer().nthr_per_group_ + id_in_grp;
+    auto space = scratchpad.template get<data_t>(key_reducer_space);
+    return space + offset_factor * space_per_thread(balancer());
+}
+
+template <impl::data_type_t data_type>
+int cpu_reducer_2d_t<data_type>::choose_x_blocking(int nx, int ny,
+        int nthr_per_grp) const {
+    // find x_blocking for better balance reducing work between threads
+    assert(conf_.x_block_ > 0 && nx > conf_.x_block_
+            && nx % conf_.x_block_ == 0);
+    int x_blocking = nx / conf_.x_block_;
+    int min_x_blocking =
+            utils::div_up(x_blocking, nstl::max(1, nthr_per_grp / ny));
+    while (true) {
+        if (x_blocking % 2 == 0 && x_blocking >= min_x_blocking * 2)
+            x_blocking /= 2;
+        else if (x_blocking % 3 == 0 && x_blocking >= min_x_blocking * 3)
+            x_blocking /= 3;
+        else
+            break;
+    }
+    if (x_blocking >= min_x_blocking * 4) x_blocking = 1;
+    x_blocking *= conf_.x_block_;
+    return x_blocking;
+}
+
+template <impl::data_type_t data_type>
+void cpu_reducer_2d_t<data_type>::reduce_block(const data_t* space_base,
+        data_t *dst, int job, int start_y, int start_x,
+        int ny_start, int nx_start, int ny_step, int nx_step) const {
+    data_t *d = dst + (start_y + ny_start) * conf_.dst_x_
+        + start_x + nx_start;
+    const data_t *space = space_base + job * balancer().job_size_
+        + ny_start * conf_.job_size_x_ + nx_start;
+#ifdef SIMPLE_IMPL
+    for (int idg = 0; idg < balancer().nthr_per_group_; ++idg) {
+        const data_t *w = &space[idg * space_per_thread(balancer())];
+        for (int y = 0; y < ny_step; ++y)
+            for (int x = 0; x < nx_step; ++x) {
+                d[y * conf_.dst_x_ + x]
+                    = (idg == 0 ? 0 : d[y * conf_.dst_x_ + x])
+                    + w[y * conf_.job_size_x_ + x];
+            }
+    }
+#else
+    (*drv_)(d, space, ny_step, nx_step);
+#endif
+}
+
+template <impl::data_type_t data_type>
+void cpu_reducer_2d_t<data_type>::reduce_nolock(int ithr, data_t *dst,
+        const memory_tracking::grantor_t &scratchpad) const {
+    bool redundant_reduction = balancer().nthr_per_group_ == 1
+        || balancer().idle(ithr);
+    if (redundant_reduction) return;
+
+    const int id_in_grp = balancer().id_in_group(ithr);
+    const int njobs_in_grp = balancer().ithr_njobs(ithr);
+    const int njobs_x = utils::div_up(conf_.dst_x_, conf_.job_size_x_);
+    const int global_job_start = balancer().ithr_job_off(ithr);
+
+    const data_t *space_base = get_local_ptr(ithr - id_in_grp, scratchpad);
+
+    const int pr_grps = nstl::min(njobs_in_grp, balancer().nthr_per_group_);
+    const int pr_nthr_per_grp = balancer().nthr_per_group_ / pr_grps;
+
+    if (id_in_grp >= pr_grps * pr_nthr_per_grp)
+        return; /* idle */
+
+    const int pr_my_grp = id_in_grp / pr_nthr_per_grp;
+    const int pr_my_id = id_in_grp % pr_nthr_per_grp;
+
+    int pr_job_start{0}, pr_job_end{0};
+    balance211(njobs_in_grp, pr_grps, pr_my_grp, pr_job_start, pr_job_end);
+
+    for (int j = pr_job_start; j < pr_job_end; ++j) {
+        const int global_job = global_job_start + j;
+        const int j_y = global_job / njobs_x;
+        const int j_x = global_job % njobs_x;
+        const int start_y = j_y * conf_.job_size_y_;
+        const int start_x = j_x * conf_.job_size_x_;
+        const int ny = nstl::min(conf_.dst_y_ - start_y, conf_.job_size_y_);
+        const int nx = nstl::min(conf_.dst_x_ - start_x, conf_.job_size_x_);
+        int x_blocking = choose_x_blocking(nx, ny, pr_nthr_per_grp);
+
+        int nxy_start{0}, nxy_end{0};
+        balance211(ny * nx / x_blocking, pr_nthr_per_grp, pr_my_id,
+                nxy_start, nxy_end);
+        if (nxy_start == nxy_end) continue;
+        nxy_start *= x_blocking;
+        nxy_end *= x_blocking;
+
+        int nxy = nxy_start;
+        if (nxy % nx != 0) {
+            int nx_step = nstl::min(nx - nxy % nx, nxy_end - nxy);
+            reduce_block(space_base, dst, j, start_y, start_x,
+                    nxy / nx, nxy % nx, 1, nx_step);
+            nxy += nx_step;
+        }
+        if ((nxy_end - nxy) > nx) {
+            int ny_step = (nxy_end - nxy) / nx;
+            reduce_block(space_base, dst, j, start_y, start_x,
+                    nxy / nx, nxy % nx, ny_step, nx);
+            nxy += nx * ny_step;
+        }
+        if ((nxy_end - nxy) > 0) {
+            reduce_block(space_base, dst, j, start_y, start_x,
+                    nxy / nx, nxy % nx, 1, nxy_end - nxy);
+        }
+    }
+}
+
+template struct cpu_reducer_2d_t<data_type::f32>;
+template struct cpu_reducer_2d_t<data_type::s32>;
+
+/* accumulator section */
+
+template <impl::data_type_t data_type>
+cpu_accumulator_1d_t<data_type>::cpu_accumulator_1d_t(): drv_(nullptr) {
+    drv_ = create_reduce_2d_drv<data_type>(1, 0, 0, 0, false);
+}
+
+template <impl::data_type_t data_type>
+cpu_accumulator_1d_t<data_type>::~cpu_accumulator_1d_t() {
+    delete drv_;
+}
+
+template <impl::data_type_t data_type>
+void cpu_accumulator_1d_t<data_type>::accumulate(data_t *dst,
+        const data_t *src, size_t size) {
+    (*drv_)(dst, src, 1, size);
+}
+
+template struct cpu_accumulator_1d_t<data_type::f32>;
+template struct cpu_accumulator_1d_t<data_type::s32>;
+
+}
+}
+}
+
+// vim: et ts=4 sw=4 cindent cino^=l0,\:0,N-s
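
For context on the scheme above: reduce_balancer_t::balance() brute-forces a partition of nthr_ threads into ngroups_ groups, where each group owns up to njobs_per_group_ub_ jobs and splits the reduction_size_-deep reduction among its nthr_per_group_ threads. The standalone C++ sketch below restates that cost model; the names (balance_result, div_up, the free function balance) are local to the sketch, not mkl-dnn API.

    #include <algorithm>
    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    /* Standalone restatement of the cost model searched in balance().
     * Only the arithmetic mirrors the library; all names are local. */
    struct balance_result { int ngroups, nthr_per_group, njobs_per_group_ub; };

    static int div_up(int a, int b) { return (a + b - 1) / b; }

    static balance_result balance(int nthr, int job_size, int njobs,
            int reduction_size, size_t max_buffer_size, bool syncable) {
        assert(nthr > 0 && job_size > 0 && njobs > 0 && reduction_size > 0);
        const int job_complexity = 1;
        const int min_njobs_per_group = std::max(1, njobs / nthr);
        const int max_njobs_per_group = std::max(1,
                (int)(max_buffer_size / ((size_t)nthr * job_size)));

        /* initial guess, exactly as in the diff */
        int ngroups = std::min(njobs / min_njobs_per_group, nthr);
        int nthr_per_group
                = syncable ? std::min(nthr / ngroups, reduction_size) : 1;
        int njobs_per_group_ub = div_up(njobs, ngroups);
        size_t best_cost = (size_t)njobs * job_size * reduction_size;

        for (int c_njobs = min_njobs_per_group; c_njobs < njobs; ++c_njobs) {
            const int c_ngroups = std::min(njobs / c_njobs, nthr);
            const int c_nthr = syncable
                    ? std::min(nthr / c_ngroups, reduction_size) : 1;
            const int c_njobs_ub = div_up(njobs, c_ngroups);

            /* candidate must fit the reduction scratch buffer */
            if (c_nthr > 1 && c_njobs_ub > max_njobs_per_group) continue;

            /* per-thread cost: own slice of the reduction, plus one extra
             * pass over the group buffer when a final reduction is needed */
            const size_t c_cost = (size_t)job_size * c_njobs_ub
                    * (job_complexity * div_up(reduction_size, c_nthr)
                            + (c_nthr != 1));
            if (c_cost < best_cost) {
                ngroups = c_ngroups;
                nthr_per_group = c_nthr;
                njobs_per_group_ub = c_njobs_ub;
                best_cost = c_cost;
            }
        }
        return {ngroups, nthr_per_group, njobs_per_group_ub};
    }

    int main() {
        balance_result r = balance(16, 1024, 64, 32, (size_t)8 << 20, true);
        std::printf("%d groups x %d threads, <= %d jobs per group\n",
                r.ngroups, r.nthr_per_group, r.njobs_per_group_ub);
    }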
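The kernel emitted by reducer_2d_driver_f_s_32_t::generate() unrolls and vectorizes the inner loop over three branch sizes (n_vregs full vectors, one vector, one scalar), but its net effect reduces to a scalar loop. The reference below is inferred from the generator (f32 case, float standing in for data_t); it is a sketch, not library code.

    #include <cstddef>

    /* Scalar equivalent (sketch) of the generated kernel: for each of ny
     * rows of nx floats, accumulate n_src source rows (spaced src_ld
     * elements apart) into dst, optionally starting from zero. */
    void reduce_2d_ref(float *dst, const float *srcs, size_t ny, size_t nx,
            int n_src, size_t src_ld, size_t src_step, size_t dst_step,
            bool nullify_dst) {
        for (size_t y = 0; y < ny; ++y) {
            for (size_t x = 0; x < nx; ++x) {
                float acc = nullify_dst ? 0.f : dst[x];
                for (int s = 0; s < n_src; ++s)
                    acc += srcs[(size_t)s * src_ld + x];
                dst[x] = acc;
            }
            /* loop_x() restores the row base pointers; generate() then
             * advances them by dst_step_/src_step_ elements per row */
            dst += dst_step;
            srcs += src_step;
        }
    }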
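In cpu_reducer_t::reduce_nolock() the group's partial buffers are reduced cooperatively: the reduction space is cut into cache-line chunks (cl = 64 / sizeof(data_t) elements) and the chunks are spread over the group's threads with balance211, so each thread invokes the driver on a disjoint slice. Assuming even-split semantics for balance211 (the first n % nthr workers get one extra chunk), the partitioning looks like:

    #include <algorithm>
    #include <cstddef>

    /* Assumed even-split semantics of balance211, for illustration only. */
    void balance211_sketch(size_t n, int nthr, int ithr,
            size_t &start, size_t &end) {
        const size_t base = n / nthr, extra = n % nthr;
        start = (size_t)ithr * base + std::min((size_t)ithr, extra);
        end = start + base + ((size_t)ithr < extra ? 1 : 0);
    }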